Describe the bug
Trying to use the Machine Learning step fails, both in the self-hosted web app and in example_WORC.ipynb.
Steps/Code to Reproduce
import pandas as pd
from pathlib import Path
from autorad.external.download_WORC import download_WORCDatabase
# Set where we will save our data and results
base_dir = Path.cwd() / "autorad_tutorial"
data_dir = base_dir / "data"
result_dir = base_dir / "results"
data_dir.mkdir(exist_ok=True, parents=True)
result_dir.mkdir(exist_ok=True, parents=True)
%load_ext autoreload
%autoreload 2
# download data (it may take a few minutes)
download_WORCDatabase(
    dataset="Desmoid",
    data_folder=data_dir,
    n_subjects=100,
)
from autorad.utils.preprocessing import get_paths_with_separate_folder_per_case
# create a table with all the paths
paths_df = get_paths_with_separate_folder_per_case(data_dir, relative=True)
paths_df.sample(5)
from autorad.data.dataset import ImageDataset
from autorad.feature_extraction.extractor import FeatureExtractor
import logging
logging.getLogger().setLevel(logging.CRITICAL)
image_dataset = ImageDataset(
    paths_df,
    ID_colname="ID",
    root_dir=data_dir,
)
# Let's take a look at the data, plotting random 10 cases
image_dataset.plot_examples(n=10, window=None)
extractor = FeatureExtractor(image_dataset, extraction_params="MR_default.yaml")
feature_df = extractor.run()
feature_df.head()
label_df = pd.read_csv(data_dir / "labels.csv")
label_df.sample(5)
from autorad.data.dataset import FeatureDataset
merged_feature_df = feature_df.merge(label_df, left_on="ID", right_on="patient_ID", how="left")
feature_dataset = FeatureDataset(
    merged_feature_df,
    target="diagnosis",
    ID_colname="ID",
)
splits_path = result_dir / "splits.json"
feature_dataset.split(method="train_val_test", save_path=splits_path)
from autorad.models.classifier import MLClassifier
from autorad.training.trainer import Trainer
models = MLClassifier.initialize_default_sklearn_models()
print(models)
trainer = Trainer(
    dataset=feature_dataset,
    models=models,
    result_dir=result_dir,
    experiment_name="Fibromatosis_vs_sarcoma_classification",
)
trainer.run_auto_preprocessing(
    selection_methods=["boruta"],
    oversampling=False,
)
Expected Results
Initialising the trainer and running the auto-preprocessing on the extracted features should complete without errors.
Actual Results
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Input In [15], in <cell line: 7>()
1 trainer = Trainer(
2 dataset=feature_dataset,
3 models=models,
4 result_dir=result_dir,
5 experiment_name="Fibromatosis_vs_sarcoma_classification",
6 )
----> 7 trainer.run_auto_preprocessing(
8 selection_methods=["boruta"],
9 oversampling=False,
10 )
File ~/AutoRadiomics/autorad/training/trainer.py:78, in Trainer.run_auto_preprocessing(self, oversampling, selection_methods)
70 preprocessor = Preprocessor(
71 normalize=True,
72 feature_selection_method=selection_method,
73 oversampling_method=oversampling_method,
74 )
75 try:
76 preprocessed[selection_method][
77 oversampling_method
---> 78 ] = preprocessor.fit_transform(self.dataset.data)
79 except AssertionError:
80 log.error(
81 f"Preprocessing with {selection_method} and {oversampling_method} failed."
82 )
File ~/AutoRadiomics/autorad/preprocessing/preprocessor.py:66, in Preprocessor.fit_transform(self, data)
64 result_y = {}
65 all_features = X.train.columns.tolist()
---> 66 X_train_trans, y_train_trans = self.pipeline.fit_transform(
67 X.train, y.train
68 )
69 self.selected_features = self.pipeline["select"].selected_features(
70 column_names=all_features
71 )
72 result_X["train"] = pd.DataFrame(
73 X_train_trans, columns=self.selected_features
74 )
File ~/miniconda3/envs/AutoRadiomics/lib/python3.10/site-packages/sklearn/pipeline.py:434, in Pipeline.fit_transform(self, X, y, **fit_params)
432 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
433 if hasattr(last_step, "fit_transform"):
--> 434 return last_step.fit_transform(Xt, y, **fit_params_last_step)
435 else:
436 return last_step.fit(Xt, y, **fit_params_last_step).transform(Xt)
File ~/AutoRadiomics/autorad/feature_selection/selector.py:47, in CoreSelector.fit_transform(self, X, y)
44 def fit_transform(
45 self, X: np.ndarray, y: np.ndarray
46 ) -> tuple[np.ndarray, np.ndarray]:
---> 47 self.fit(X, y)
48 return X[:, self.selected_columns], y
File ~/AutoRadiomics/autorad/feature_selection/selector.py:124, in BorutaSelector.fit(self, X, y, verbose)
122 with warnings.catch_warnings():
123 warnings.simplefilter("ignore")
--> 124 model.fit(X, y)
125 self.selected_columns = np.where(model.support_)[0].tolist()
126 if not self.selected_columns:
File ~/miniconda3/envs/AutoRadiomics/lib/python3.10/site-packages/boruta/boruta_py.py:201, in BorutaPy.fit(self, X, y)
188 def fit(self, X, y):
189 """
190 Fits the Boruta feature selection with the provided estimator.
191
(...)
198 The target values.
199 """
--> 201 return self._fit(X, y)
File ~/miniconda3/envs/AutoRadiomics/lib/python3.10/site-packages/boruta/boruta_py.py:251, in BorutaPy._fit(self, X, y)
249 def _fit(self, X, y):
250 # check input params
--> 251 self._check_params(X, y)
252 self.random_state = check_random_state(self.random_state)
253 # setup variables for Boruta
File ~/miniconda3/envs/AutoRadiomics/lib/python3.10/site-packages/boruta/boruta_py.py:517, in BorutaPy._check_params(self, X, y)
513 """
514 Check hyperparameters as well as X and y before proceeding with fit.
515 """
516 # check X and y are consistent len, X is Array and y is column
--> 517 X, y = check_X_y(X, y)
518 if self.perc <= 0 or self.perc > 100:
519 raise ValueError('The percentile should be between 0 and 100.')
File ~/miniconda3/envs/AutoRadiomics/lib/python3.10/site-packages/sklearn/utils/validation.py:964, in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
961 if y is None:
962 raise ValueError("y cannot be None")
--> 964 X = check_array(
965 X,
966 accept_sparse=accept_sparse,
967 accept_large_sparse=accept_large_sparse,
968 dtype=dtype,
969 order=order,
970 copy=copy,
971 force_all_finite=force_all_finite,
972 ensure_2d=ensure_2d,
973 allow_nd=allow_nd,
974 ensure_min_samples=ensure_min_samples,
975 ensure_min_features=ensure_min_features,
976 estimator=estimator,
977 )
979 y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric)
981 check_consistent_length(X, y)
File ~/miniconda3/envs/AutoRadiomics/lib/python3.10/site-packages/sklearn/utils/validation.py:746, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
744 array = array.astype(dtype, casting="unsafe", copy=False)
745 else:
--> 746 array = np.asarray(array, order=order, dtype=dtype)
747 except ComplexWarning as complex_warning:
748 raise ValueError(
749 "Complex data not supported\n{}\n".format(array)
750 ) from complex_warning
ValueError: could not broadcast input array from shape (60,1015) into shape (60,)
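
For context, the final ValueError suggests that the feature matrix handed to BorutaPy (60 training cases, 1015 feature columns) could not be coerced into a plain 2-D numeric array. A minimal, hypothetical check, assuming the merged_feature_df from the steps above is still in scope and that non-numeric columns (for example leftover string ID columns) are what breaks the conversion:

# Hypothetical sanity check (not part of the failing run above): list the
# columns of the merged table that pandas does not treat as numeric, since
# sklearn's check_X_y has to convert X into a homogeneous 2-D float array.
candidate_features = merged_feature_df.drop(
    columns=["ID", "patient_ID", "diagnosis"], errors="ignore"
)
non_numeric_cols = candidate_features.select_dtypes(exclude="number").columns.tolist()
print(non_numeric_cols)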