When I use a Dask DataFrame as input for HyperGBM (v0.2.2) with `use_cache=False` and without specifying `cache_dir`, it fails with the error below.
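For context, a minimal sketch of my setup (the scheduler address, data source, and target column are illustrative; `make_experiment` and its `use_cache` option are assumed from the HyperGBM docs):

```python
import dask.dataframe as dd
from distributed import Client
from hypergbm import make_experiment

client = Client('scheduler-host:8786')               # remote Dask cluster
train_data = dd.read_parquet('/data/train.parquet')  # any Dask DataFrame

experiment = make_experiment(train_data, target='y',
                             use_cache=False)        # cache_dir left unset
estimator = experiment.run()
```

The error: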
```
[ERROR] E hypergbm.hyper_gbm.py 584 - FileNotFoundError: [Errno 2] No such file or directory: '/tmp/workdir/hypergbm_cache/22805_16_32f8e48ef673a57e6644a4b1b295fb66_4355a2f3c42cdb4a7b3c3f53ee8a26b5.parquet/part.0.parquet'
[ERROR] Traceback (most recent call last):
[ERROR] File "/usr/local/lib/python3.7/site-packages/hypergbm/hyper_gbm.py", line 582, in _save_df
[ERROR] to_parquet(df, filepath, fs)
[ERROR] File "/usr/local/lib/python3.7/site-packages/tabular_toolbox/persistence.py", line 93, in to_parquet
[ERROR] result = dask.compute(parts)
[ERROR] File "/usr/local/lib/python3.7/site-packages/dask/base.py", line 565, in compute
[ERROR] results = schedule(dsk, keys, **kwargs)
[ERROR] File "/usr/local/lib/python3.7/site-packages/distributed/client.py", line 2654, in get
[ERROR] results = self.gather(packed, asynchronous=asynchronous, direct=direct)
[ERROR] File "/usr/local/lib/python3.7/site-packages/distributed/client.py", line 1969, in gather
[ERROR] asynchronous=asynchronous,
[ERROR] File "/usr/local/lib/python3.7/site-packages/distributed/client.py", line 838, in sync
[ERROR] self.loop, func, *args, callback_timeout=callback_timeout, **kwargs
[ERROR] File "/usr/local/lib/python3.7/site-packages/distributed/utils.py", line 351, in sync
[ERROR] raise exc.with_traceback(tb)
[ERROR] File "/usr/local/lib/python3.7/site-packages/distributed/utils.py", line 334, in f
[ERROR] result[0] = yield future
[ERROR] File "/usr/local/lib/python3.7/site-packages/tornado/gen.py", line 762, in run
[ERROR] value = future.result()
[ERROR] File "/usr/local/lib/python3.7/site-packages/distributed/client.py", line 1828, in _gather
[ERROR] raise exception.with_traceback(traceback)
[ERROR] File "/usr/local/lib/python3.7/site-packages/tabular_toolbox/persistence.py", line 54, in _arrow_write_parquet
[ERROR] pq.write_table(tbl, target_path, filesystem=filesystem, **pa_options)
[ERROR] File "/usr/local/lib/python3.7/site-packages/pyarrow/parquet.py", line 1797, in write_table
[ERROR] **kwargs) as writer:
[ERROR] File "/usr/local/lib/python3.7/site-packages/pyarrow/parquet.py", line 609, in __init__
[ERROR] path, compression=None)
[ERROR] File "pyarrow/_fs.pyx", line 660, in pyarrow._fs.FileSystem.open_output_stream
[ERROR] out_handle = GetResultValue(self.fs.OpenOutputStream(pathstr))
[ERROR] File "pyarrow/error.pxi", line 122, in pyarrow.lib.pyarrow_internal_check_status
[ERROR] return check_status(status)
[ERROR] File "pyarrow/_fs.pyx", line 1072, in pyarrow._fs._cb_open_output_stream
[ERROR] stream = handler.open_output_stream(frombytes(path))
[ERROR] File "/usr/local/lib/python3.7/site-packages/pyarrow/fs.py", line 314, in open_output_stream
[ERROR] return PythonFile(self.fs.open(path, mode="wb"), mode="w")
[ERROR] File "/usr/local/lib/python3.7/site-packages/hypernets/utils/_fsutils.py", line 105, in execute
[ERROR] result = fn(self.to_rpath(rpath), *args, **kwargs)
[ERROR] File "/usr/local/lib/python3.7/site-packages/fsspec/spec.py", line 943, in open
[ERROR] **kwargs,
[ERROR] File "/usr/local/lib/python3.7/site-packages/fsspec/implementations/local.py", line 118, in _open
[ERROR] return LocalFileOpener(path, mode, fs=self, **kwargs)
[ERROR] File "/usr/local/lib/python3.7/site-packages/fsspec/implementations/local.py", line 200, in __init__
[ERROR] self._open()
[ERROR] File "/usr/local/lib/python3.7/site-packages/fsspec/implementations/local.py", line 205, in _open
[ERROR] self.f = open(self.path, mode=self.mode)
```
And it sometimes results in another error:
```
[ERROR] 07-14 16:24:18 E hypernets.e._experiment.py 85 - ExperiementID:[None] - evaluate feature importance:
[ERROR] File "/usr/local/lib/python3.7/site-packages/hypernets/experiment/_experiment.py", line 75, in run
[ERROR] y_eval=self.y_eval, eval_size=self.eval_size, **kwargs)
[ERROR] File "/usr/local/lib/python3.7/site-packages/hypergbm/experiment.py", line 1116, in train
[ERROR] return super().train(hyper_model, X_train, y_train, X_test, X_eval, y_eval, **kwargs)
[ERROR] File "/usr/local/lib/python3.7/site-packages/hypergbm/experiment.py", line 839, in train
[ERROR] step.fit_transform(hyper_model, X_train, y_train, X_test=X_test, X_eval=X_eval, y_eval=y_eval, **kwargs)
[ERROR] File "/usr/local/lib/python3.7/site-packages/hypergbm/experiment.py", line 431, in fit_transform
[ERROR] importances = feature_importance_batch(estimators, X_eval, y_eval, self.scorer, n_repeats=5)
[ERROR] File "/usr/local/lib/python3.7/site-packages/hypergbm/feature_importance.py", line 73, in feature_importance_batch
[ERROR] random_state=random_state)
[ERROR] File "/usr/local/lib/python3.7/site-packages/tabular_toolbox/dask_ex.py", line 356, in permutation_importance
[ERROR] col_scores.append(scorer(estimator, X_permuted, y))
[ERROR] File "/usr/local/lib/python3.7/site-packages/sklearn/metrics/_scorer.py", line 170, in __call__
[ERROR] sample_weight=sample_weight)
[ERROR] File "/usr/local/lib/python3.7/site-packages/sklearn/metrics/_scorer.py", line 247, in _score
[ERROR] y_pred = method_caller(clf, "predict_proba", X)
[ERROR] File "/usr/local/lib/python3.7/site-packages/sklearn/metrics/_scorer.py", line 53, in _cached_call
[ERROR] return getattr(estimator, method)(*args, **kwargs)
[ERROR] File "/usr/local/lib/python3.7/site-packages/tabular_toolbox/dask_ex.py", line 274, in call_and_compute
[ERROR] r = fn_call(*args, **kwargs)
[ERROR] File "/usr/local/lib/python3.7/site-packages/hypergbm/hyper_gbm.py", line 483, in predict_proba
[ERROR] proba = getattr(self.gbm_model, method)(X)
[ERROR] File "/usr/local/lib/python3.7/site-packages/hypergbm/estimators.py", line 382, in predict_proba
[ERROR] proba = dex.fix_binary_predict_proba_result(proba)
[ERROR] File "/usr/local/lib/python3.7/site-packages/tabular_toolbox/dask_ex.py", line 261, in fix_binary_predict_proba_result
[ERROR] proba = make_chunk_size_known(proba)
[ERROR] File "/usr/local/lib/python3.7/site-packages/tabular_toolbox/dask_ex.py", line 142, in make_chunk_size_known
[ERROR] a = a.compute_chunk_sizes()
[ERROR] File "/usr/local/lib/python3.7/site-packages/dask/array/core.py", line 1274, in compute_chunk_sizes
[ERROR] [tuple([int(chunk) for chunk in chunks]) for chunks in compute(tuple(c))[0]]
[ERROR] File "/usr/local/lib/python3.7/site-packages/dask/base.py", line 565, in compute
[ERROR] results = schedule(dsk, keys, **kwargs)
[ERROR] File "/usr/local/lib/python3.7/site-packages/distributed/client.py", line 2654, in get
[ERROR] results = self.gather(packed, asynchronous=asynchronous, direct=direct)
[ERROR] File "/usr/local/lib/python3.7/site-packages/distributed/client.py", line 1969, in gather
[ERROR] asynchronous=asynchronous,
[ERROR] File "/usr/local/lib/python3.7/site-packages/distributed/client.py", line 838, in sync
[ERROR] self.loop, func, *args, callback_timeout=callback_timeout, **kwargs
[ERROR] File "/usr/local/lib/python3.7/site-packages/distributed/utils.py", line 351, in sync
[ERROR] raise exc.with_traceback(tb)
```
The first error is logged by `hyper_gbm.py`:
```python
def _save_df(self, filepath, df):
    try:
        # with fs.open(filepath, 'wb') as f:
        #     df.to_parquet(f)
        if not isinstance(df, pd.DataFrame):
            # a dask dataframe is written as a directory of part files,
            # so the target directory is created first
            fs.mkdirs(filepath, exist_ok=True)
        to_parquet(df, filepath, fs)
    except Exception as e:
        logger.error(e)
        # traceback.print_exc()
        if fs.exists(filepath):
            fs.rm(filepath, recursive=True)
```
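From the traceback, `tabular_toolbox.persistence.to_parquet` appears to write each partition as a delayed task, so the actual `open()`/write runs on the Dask workers, not on the client that called `fs.mkdirs`. A paraphrased sketch of what the traceback implies (this is my reading, not the actual source; `_arrow_write_parquet` is the function named at `persistence.py:54`):

```python
import dask
import pyarrow as pa
import pyarrow.parquet as pq

def _arrow_write_parquet(df_part, target_path, filesystem):
    # runs on whichever Dask worker executes the task
    tbl = pa.Table.from_pandas(df_part)
    pq.write_table(tbl, target_path, filesystem=filesystem)
    return target_path

def to_parquet(df, filepath, fs):
    # one delayed write task per partition, e.g. .../part.0.parquet
    parts = [
        dask.delayed(_arrow_write_parquet)(part, f'{filepath}/part.{i}.parquet', fs)
        for i, part in enumerate(df.to_delayed())
    ]
    return dask.compute(parts)  # executed via the distributed scheduler
```

If that reading is right, the directory created by `fs.mkdirs` on the client may simply not exist on the worker machines, which would also explain why everything works when client and workers share one machine.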
I can see that before the invocation of `to_parquet`, the `filepath` is already created by `fs`. Here is what I'm confused about:
- While the default `cache_dir` is `hypergbm_cache`, where does the prefix `/tmp/workdir/` come from? The only location related to `/tmp/workdir/` that I can find is `hypernets/utils/_fsutils.py` (see the path-mapping sketch after this list):

  ```python
  if type(fs).__name__.lower().find('local') >= 0:
      if fs_root is None or fs_root == '':
          fs_root = os.path.join(tempfile.gettempdir(), 'workdir')
  ```
- Is the path created by `fs` the same as the path used inside `to_parquet`, where there are also some operations on the file system?
- Why does the error disappear when I run the same code in Jupyter?
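Regarding the first question, a small sketch of how I understand the mapping (pure standard library; that relative paths are joined under `fs_root` is my assumption from `_fsutils.py` line 105 in the traceback):

```python
import os
import tempfile

# With a local filesystem and no fs_root configured, hypernets picks:
fs_root = os.path.join(tempfile.gettempdir(), 'workdir')  # '/tmp/workdir' on Linux

# Relative cache paths then appear to be mapped under that root, e.g.:
rpath = 'hypergbm_cache/xxx.parquet/part.0.parquet'       # illustrative name
print(os.path.join(fs_root, rpath))
# -> /tmp/workdir/hypergbm_cache/xxx.parquet/part.0.parquet
```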
Then there is the cache mechanism:
The `use_cache` option cannot control the cache behavior of `hypergbm.hyper_gbm.HyperGBMEstimator.predict` and `hypergbm.hyper_gbm.HyperGBMEstimator.predict_proba` in steps such as `hypergbm.experiment.PermutationImportanceSelectionStep` or `hypergbm.experiment.EnsembleStep`: there the `HyperGBMEstimator` is loaded from training trials and the predict method is invoked with `use_cache=None`, so in `hypergbm.hyper_gbm.HyperGBMEstimator.transform_data`:
```python
def transform_data(self, X, y=None, fit=False, use_cache=None, verbose=0):
    if use_cache is None:
        use_cache = True
```
the `None` default becomes `True`, which results in intermediate data being saved.
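A minimal, runnable illustration of why the experiment-level setting gets lost (the defaulting logic is condensed from the snippet above; the calling step is taken from the second traceback):

```python
def transform_data(use_cache=None):
    # condensed from HyperGBMEstimator.transform_data
    if use_cache is None:
        use_cache = True
    return use_cache

def predict_proba(X, use_cache=None):
    # steps like PermutationImportanceSelectionStep call predict_proba
    # without passing use_cache, so it stays None here ...
    return transform_data(use_cache=use_cache)

# ... and caching is silently re-enabled:
assert predict_proba(X=None) is True
```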
To avoid this, I think the easiest way may be one of the following:

- Change `use_cache` to `False` when it's `None` in `HyperGBMEstimator.transform_data` (a sketch follows below).
- Or fix `to_parquet`.
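A sketch of the first option, assuming the snippet quoted above is the only place the default is applied:

```python
def transform_data(self, X, y=None, fit=False, use_cache=None, verbose=0):
    if use_cache is None:
        use_cache = False  # was: use_cache = True — don't cache unless explicitly requested
    # ... rest unchanged ...
```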
Looking forward to the fix.