Source code for ice.health_index_estimation.datasets

import os
from copy import copy

import pandas as pd
from scipy import interpolate

from ice.base import BaseDataset


class Milling(BaseDataset):
    """
    Preprocessed data from the Milling Data Set:
    https://data.nasa.gov/Raw-Data/Milling-Wear/vjv9-9f3x/data
    """

    def _load(self, num_chunks, force_download):
        """
        Load the dataset into list objects: self.df, self.target, self.test
        and self.test_target. There are 16 subdatasets; the list index
        corresponds to the subdataset number within the train or test part.
        Benchmark preparation uses a fixed train-test split and the
        paper-based interpolation of missing values.
        """
        ref_path = f"data/{self.name}/"
        if not os.path.exists(ref_path):
            os.makedirs(ref_path)
        zfile_path = f"data/{self.name}.zip"
        url = self._get_url(self.public_link)
        if not os.path.exists(zfile_path) or force_download:
            self._download_pgbar(url, zfile_path, self.name, num_chunks)

        self._extracting_files(zfile_path, "data/")

        # fixed train and test splits, given as subdataset (cut) indices
        train_nums = [1, 3, 5, 7, 8, 9, 10, 11, 12, 13]
        test_nums = [2, 4, 6]

        data = [
            self._read_csv_pgbar(
                ref_path + f"case_{i+1}.csv", index_col=["run_id", "sample"]
            )
            for i in range(16)
        ]

        for i in range(15):
            data[i]["material"] = data[i]["material"].astype("float64")

            # interpolate/extrapolate the VB (flank wear) target over time,
            # fitting only on the rows where VB was measured
            y = data[i].dropna().VB
            x = data[i].dropna().time
            f = interpolate.interp1d(
                x, y, assume_sorted=True, fill_value="extrapolate"
            )
            data[i]["VB"] = f(data[i].time)

            if i == 6:
                # paper-based fill for this case: 0.52 in the second half of
                # the cut, 0 in the first half (done via .loc, since fillna
                # with inplace=True on a chained .iloc slice does not modify
                # the original frame)
                midpoint = len(data[i]) // 2
                first_half = data[i].index[:midpoint]
                second_half = data[i].index[midpoint:]
                data[i].loc[second_half, "VB"] = data[i].loc[second_half, "VB"].fillna(0.52)
                data[i].loc[first_half, "VB"] = data[i].loc[first_half, "VB"].fillna(0)
            else:
                data[i] = data[i].fillna(0)

        self.df = [
            data[i].drop(columns=["VB", "Unnamed: 0", "case", "run"])
            for i in train_nums
        ]
        self.target = [data[i]["VB"] for i in train_nums]
        self.test = [
            data[i].drop(columns=["VB", "Unnamed: 0", "case", "run"])
            for i in test_nums
        ]
        self.test_target = [data[i]["VB"] for i in test_nums]
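
A minimal usage sketch, added for illustration only. It assumes that Milling can be constructed without required arguments and that BaseDataset runs _load during construction; the actual constructor signature is defined in ice.base and may differ.

from ice.health_index_estimation.datasets import Milling

dataset = Milling()  # hypothetical no-argument construction

# After loading, the fixed benchmark split is exposed as lists:
# 10 training cuts and 3 test cuts, each a DataFrame of sensor signals
# paired with a Series of interpolated VB (flank wear) targets.
print(len(dataset.df), len(dataset.test))
print(dataset.target[0].head())

The interpolation step in _load can also be illustrated in isolation. The toy snippet below is not from the library; it only shows the pattern used above, where scipy.interpolate.interp1d is fitted on the rows with a measured VB value and fill_value="extrapolate" fills the remaining rows along the time axis.

import pandas as pd
from scipy import interpolate

toy = pd.DataFrame({"time": [0.0, 1.0, 2.0, 3.0], "VB": [0.0, None, 0.2, None]})
known = toy.dropna()  # rows where VB was actually measured
f = interpolate.interp1d(
    known.time, known.VB, assume_sorted=True, fill_value="extrapolate"
)
toy["VB"] = f(toy.time)  # linear fill and extrapolation -> [0.0, 0.1, 0.2, 0.3]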