Source code for ice.health_index_estimation.datasets
from ice.base import BaseDataset
import os
import pandas as pd
from copy import copy
from scipy import interpolate
class Milling(BaseDataset):
    """
    Preprocessed milling data built from the Milling Data Set:
    https://data.nasa.gov/Raw-Data/Milling-Wear/vjv9-9f3x/data
    """

    def set_name_public_link(self):
        """Set the dataset name and the public link it is downloaded from."""
        self.name = "milling"
        self.public_link = "https://disk.yandex.ru/d/jnYLUicx6TIkVw"

    def _load(self, num_chunks, force_download):
        """
        Load the dataset into the list objects ``self.df``, ``self.target``,
        ``self.test`` and ``self.test_target``.

        Sixteen case files (``case_1.csv`` ... ``case_16.csv``) are read. A
        fixed subset of them forms the train part (``self.df`` /
        ``self.target``) and another fixed subset forms the test part
        (``self.test`` / ``self.test_target``); the position in each list
        follows the order of the corresponding index list below.

        Benchmark preparation with a fixed train-test split and the
        paper-based interpolation of missing values.

        Args:
            num_chunks: forwarded to ``self._download_pgbar``.
            force_download: when true, the archive is downloaded even if the
                zip file is already present on disk.
        """
        ref_path = f"data/{self.name}/"
        os.makedirs(ref_path, exist_ok=True)
        zfile_path = f"data/{self.name}.zip"

        url = self._get_url(self.public_link)
        if not os.path.exists(zfile_path) or force_download:
            self._download_pgbar(url, zfile_path, self.name, num_chunks)
        self._extracting_files(zfile_path, "data/")

        # Train / test split expressed as 0-based indices into ``data``;
        # index ``i`` refers to the frame read from ``case_{i+1}.csv``.
        train_nums = [1, 3, 5, 7, 8, 9, 10, 11, 12, 13]
        test_nums = [2, 4, 6]

        data = [
            self._read_csv_pgbar(
                ref_path + f"case_{i+1}.csv", index_col=["run_id", "sample"]
            )
            for i in range(16)
        ]

        # NOTE(review): only the first 15 frames are preprocessed; the 16th is
        # read but is not referenced by train_nums / test_nums. Kept as in the
        # original -- confirm whether this is intentional.
        for i in range(15):
            frame = data[i]
            frame["material"] = frame["material"].astype("float64")

            # Fit a linear interpolator on the fully observed rows and use it
            # to reconstruct VB for every time stamp of the case.
            known = frame.dropna()
            f = interpolate.interp1d(
                known["time"], known["VB"], assume_sorted=True, fill_value="extrapolate"
            )
            frame["VB"] = f(frame["time"])

            if i == 6:
                # Any VB value still missing in this case is filled with 0 in
                # the first half of the rows and 0.52 in the second half.
                # The fill is done on an explicit copy that is assigned back:
                # an in-place fillna on ``frame["VB"].iloc[...]`` operates on
                # a temporary and is not guaranteed to reach the frame
                # (it never does under pandas Copy-on-Write).
                midpoint = len(frame) // 2
                vb = frame["VB"].copy()
                # ``to_numpy()`` keeps the assignment purely positional.
                vb.iloc[midpoint:] = vb.iloc[midpoint:].fillna(0.52).to_numpy()
                vb.iloc[:midpoint] = vb.iloc[:midpoint].fillna(0).to_numpy()
                frame["VB"] = vb
            else:
                data[i] = frame.fillna(0)

        # Columns that are not model inputs: the target itself and bookkeeping.
        dropped = ["VB", "Unnamed: 0", "case", "run"]
        self.df = [data[i].drop(columns=dropped) for i in train_nums]
        self.target = [data[i]["VB"] for i in train_nums]
        self.test = [data[i].drop(columns=dropped) for i in test_nums]
        self.test_target = [data[i]["VB"] for i in test_nums]