Error: Found array with 0 feature(s)

Krkaufma · August 26, 2020, 5:57pm

I am using automatminer from the express preset.

During the run, I frequently see

_pre_test decorator: _random_mutation_operator: num_test=1 Found array with 0 feature(s) (shape=(22, 0)) while a minimum of 1 is required by RobustScaler…

ultimately leading to

ValueError: Found array with 0 feature(s) (shape=(25, 0)) while a minimum of 1 is required by RobustScaler.

This error does not occur only with RobustScaler; I see it with the other sklearn methods as well.

Here is the log leading up to tpot showing there should be features available:

2020-08-26 10:39:15 INFO DataCleaner: After handling na: 25 samples, 4147 features
2020-08-26 10:39:15 INFO DataCleaner: Finished fitting.
2020-08-26 10:39:15 INFO FeatureReducer: Starting fitting.
2020-08-26 10:39:38 INFO FeatureReducer: 1385 features removed due to cross correlation more than 0.95
2020-08-26 10:39:50 INFO TreeFeatureReducer: Finished tree-based feature reduction of 2761 initial features to 78
2020-08-26 10:39:50 INFO FeatureReducer: Finished fitting.
2020-08-26 10:39:50 INFO FeatureReducer: Starting transforming.
2020-08-26 10:39:50 INFO FeatureReducer: Finished transforming.
2020-08-26 10:39:50 INFO TPOTAdaptor: Starting fitting.
27 operators have been imported by TPOT.

There are also pareto front scores shown during the process:

Generation 1 - Current Pareto front scores:
-3 -64.88276749444293 RidgeCV(ZeroCount(SelectFwe(input_matrix, SelectFwe__alpha=0.03)))
…

Generation 10 - Current Pareto front scores:
-3 -44.85358622288145 RidgeCV(RobustScaler(VarianceThreshold(input_matrix, VarianceThreshold__threshold=0.01)))
…

Full stack trace:

ValueError Traceback (most recent call last)
in
1 # Create Matpipe in ‘express’ mode for recommended settings
2 pipe = MatPipe.from_preset(preset="express", n_jobs=22)
----> 3 pipe.fit(df=train_df, target=target_name)

~/.local/lib/python3.6/site-packages/automatminer/utils/pkg.py in wrapper(*args, **kwargs)
102 def wrapper(*args, **kwargs):
103 args[0].is_fit = False
→ 104 result = func(*args, **kwargs)
105 args[0].is_fit = True
106 return result

~/.local/lib/python3.6/site-packages/automatminer/pipeline.py in fit(self, df, target)
182 df = self.cleaner.fit_transform(df, target)
183 df = self.reducer.fit_transform(df, target)
→ 184 self.learner.fit(df, target)
185 logger.info(“MatPipe successfully fit.”)
186 self.post_fit_df = df

~/.local/lib/python3.6/site-packages/automatminer/utils/log.py in wrapper(*args, **kwargs)
94 self = args[0]
95 logger.info(“{}Starting {}.”.format(self._log_prefix, operation))
—> 96 result = meth(*args, **kwargs)
97 logger.info(“{}Finished {}.”.format(self._log_prefix, operation))
98 return result

~/.local/lib/python3.6/site-packages/automatminer/utils/pkg.py in wrapper(*args, **kwargs)
102 def wrapper(*args, **kwargs):
103 args[0].is_fit = False
→ 104 result = func(*args, **kwargs)
105 args[0].is_fit = True
106 return result

~/.local/lib/python3.6/site-packages/automatminer/automl/adaptors.py in fit(self, df, target, **fit_kwargs)
135 self._features = df.drop(columns=target).columns.tolist()
136 self._fitted_target = target
→ 137 self._backend = self._backend.fit(X, y, **fit_kwargs)
138 return self
139

~/.local/lib/python3.6/site-packages/tpot/base.py in fit(self, features, target, sample_weight, groups)
744 # raise the exception if it’s our last attempt
745 if attempt == (attempts - 1):
→ 746 raise e
747 return self
748

~/.local/lib/python3.6/site-packages/tpot/base.py in fit(self, features, target, sample_weight, groups)
736
737 self._update_top_pipeline()
→ 738 self._summary_of_best_pipeline(features, target)
739 # Delete the temporary cache before exiting
740 self._cleanup_memory()

~/.local/lib/python3.6/site-packages/tpot/base.py in _summary_of_best_pipeline(self, features, target)
860 with warnings.catch_warnings():
861 warnings.simplefilter(‘ignore’)
→ 862 self.pareto_front_fitted_pipelines[str(pipeline)].fit(features, target)
863
864 def predict(self, features):

~/.local/lib/python3.6/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
348 This estimator
349 “”"
→ 350 Xt, fit_params = self._fit(X, y, **fit_params)
351 with _print_elapsed_time(‘Pipeline’,
352 self._log_message(len(self.steps) - 1)):

~/.local/lib/python3.6/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
313 message_clsname=‘Pipeline’,
314 message=self._log_message(step_idx),
→ 315 **fit_params_steps[name])
316 # Replace the transformer of the step with the fitted
317 # transformer. This is necessary when loading the transformer

~/.local/lib/python3.6/site-packages/joblib/memory.py in call(self, *args, **kwargs)
563
564 def call(self, *args, **kwargs):
→ 565 return self._cached_call(args, kwargs)[0]
566
567 def getstate(self):

~/.local/lib/python3.6/site-packages/joblib/memory.py in _cached_call(self, args, kwargs, shelving)
529
530 if must_call:
→ 531 out, metadata = self.call(*args, **kwargs)
532 if self.mmap_mode is not None:
533 # Memmap the output at the first call to be consistent with

~/.local/lib/python3.6/site-packages/joblib/memory.py in call(self, *args, **kwargs)
725 if self._verbose > 0:
726 print(format_call(self.func, args, kwargs))
→ 727 output = self.func(*args, **kwargs)
728 self.store_backend.dump_item(
729 [func_id, args_id], output, verbose=self._verbose)

~/.local/lib/python3.6/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
726 with _print_elapsed_time(message_clsname, message):
727 if hasattr(transformer, ‘fit_transform’):
→ 728 res = transformer.fit_transform(X, y, **fit_params)
729 else:
730 res = transformer.fit(X, y, **fit_params).transform(X)

~/.local/lib/python3.6/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
572 else:
573 # fit method of arity 2 (supervised transformation)
→ 574 return self.fit(X, y, **fit_params).transform(X)
575
576

~/.local/lib/python3.6/site-packages/sklearn/preprocessing/_data.py in fit(self, X, y)
1198 # the quantiles
1199 X = check_array(X, accept_sparse=‘csc’, estimator=self,
→ 1200 dtype=FLOAT_DTYPES, force_all_finite=‘allow-nan’)
1201
1202 q_min, q_max = self.quantile_range

~/.local/lib/python3.6/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
592 " a minimum of %d is required%s."
593 % (n_features, array.shape, ensure_min_features,
→ 594 context))
595
596 if warn_on_dtype and dtype_orig is not None and array.dtype != dtype_orig:

ValueError: Found array with 0 feature(s) (shape=(25, 0)) while a minimum of 1 is required by RobustScaler.

ardunn · August 27, 2020, 2:02am

Hey there,

Can you post some info for me?

Your OS, python version, and whether you are using conda or virtualenv or neither
The output of pip freeze (from within your virtualenv if using one)
The full automatminer.log of one of these failed runs

Krkaufma · August 27, 2020, 2:15am

Hi Alex,

I am using a conda venv and my software versions are as follows:
Ubuntu 18.04
Python 3.6.8

Here is everything from pip freeze for this virtual environment.

absl-py 0.8.1
alabaster 0.7.12
argon2-cffi 20.1.0
astor 0.8.0
attrs 19.3.0
automatminer 1.0.3.20200727
Babel 2.7.0
backcall 0.2.0
bleach 3.1.5
certifi 2020.6.20
cffi 1.14.2
chardet 3.0.4
cycler 0.10.0
dataclasses 0.7
deap 1.3.1
decorator 4.4.2
defusedxml 0.6.0
docutils 0.15.2
entrypoints 0.3
et-xmlfile 1.0.1
future 0.18.2
google-pasta 0.1.8
graphviz 0.12
grpcio 1.25.0
idna 2.10
imagecodecs-lite 2019.12.3
imageio 2.6.1
imagesize 1.1.0
importlib-metadata 1.7.0
ipykernel 5.3.4
ipython 7.16.1
ipython-genutils 0.2.0
ipywidgets 7.5.1
jdcal 1.4.1
jedi 0.17.2
Jinja2 2.10.3
joblib 0.16.0
jsonschema 3.2.0
jupyter-client 6.1.6
jupyter-core 4.6.3
Keras 2.3.1
Keras-Applications 1.0.8
Keras-Preprocessing 1.1.0
kiwisolver 1.2.0
Markdown 3.1.1
MarkupSafe 1.1.1
matminer 0.6.2
matplotlib 3.3.1
mistune 0.8.4
monty 3.0.4
more-itertools 8.0.0
mpmath 1.1.0
nbconvert 5.6.1
nbformat 5.0.7
networkx 2.5
notebook 6.1.3
numpy 1.19.1
openpyxl 3.0.2
packaging 20.4
palettable 3.3.0
pandas 1.1.1
pandocfilters 1.4.2
parso 0.7.1
pexpect 4.8.0
pickleshare 0.7.5
Pillow 7.2.0
Pint 0.15
pip 18.1
plotly 4.9.0
pluggy 0.13.1
prometheus-client 0.8.0
prompt-toolkit 3.0.6
protobuf 3.11.1
ptyprocess 0.6.0
py 1.8.0
pycparser 2.20
PyDispatcher 2.0.5
pydot 1.4.1
Pygments 2.5.2
pymatgen 2020.1.28
pymongo 3.11.0
pyparsing 2.4.7
pyrsistent 0.16.0
pytest 5.3.1
python-dateutil 2.8.1
pytz 2020.1
PyWavelets 1.1.1
PyYAML 5.1.2
pyzmq 19.0.2
requests 2.24.0
retrying 1.3.3
rfpimp 1.3.2
ruamel.yaml 0.16.10
ruamel.yaml.clib 0.2.0
scikit-image 0.16.2
scikit-learn 0.22.2
scipy 1.5.2
seaborn 0.9.0
Send2Trash 1.5.0
setuptools 49.6.0
six 1.15.0
sklearn 0.0
skrebate 0.6
snowballstemmer 2.0.0
spglib 1.16.0
Sphinx 2.2.2
sphinxcontrib-applehelp 1.0.1
sphinxcontrib-devhelp 1.0.1
sphinxcontrib-htmlhelp 1.0.2
sphinxcontrib-jsmath 1.0.1
sphinxcontrib-qthelp 1.0.2
sphinxcontrib-serializinghtml 1.1.3
stopit 1.1.2
sympy 1.6.2
tabulate 0.8.7
termcolor 1.1.0
terminado 0.8.3
testpath 0.4.4
tifffile 2019.7.26.2
tornado 6.0.4
TPOT 0.11.0
tqdm 4.48.2
traitlets 4.3.3
update-checker 0.18.0
urllib3 1.25.10
wcwidth 0.1.7
webencodings 0.5.1
Werkzeug 0.16.0
wheel 0.33.6
widgetsnbextension 3.5.1
wrapt 1.11.2
xlrd 1.2.0
zipp 3.1.0

The full automatminer.log:

2020-08-26 10:09:44 INFO 2020-08-26 10:09:44 INFO 2020-08-26 10:09:44 INFO 2020-08-26 10:09:44 INFO 2020-08-26 10:09:45 INFO 2020-08-26 10:09:45 INFO 2020-08-26 10:09:45 INFO 2020-08-26 10:09:45 INFO 2020-08-26 10:09:45 INFO 2020-08-26 10:09:45 INFO 2020-08-26 10:09:45 INFO 2020-08-26 10:09:45 INFO 2020-08-26 10:09:45 INFO 2020-08-26 10:09:45 INFO 2020-08-26 10:09:45 INFO 2020-08-26 10:09:45 INFO 2020-08-26 10:09:45 INFO 2020-08-26 10:09:45 INFO 2020-08-26 10:09:45 INFO 2020-08-26 10:09:45 INFO 2020-08-26 10:09:46 INFO 2020-08-26 10:09:46 INFO 2020-08-26 10:09:46 INFO 2020-08-26 10:10:09 INFO 2020-08-26 10:10:15 INFO 2020-08-26 10:10:15 INFO 2020-08-26 10:10:15 INFO 2020-08-26 10:10:15 INFO 2020-08-26 10:10:15 INFO 2020-08-26 10:39:12 INFO 2020-08-26 10:39:12 INFO 2020-08-26 10:39:12 INFO 2020-08-26 10:39:12 INFO 2020-08-26 10:39:13 INFO 2020-08-26 10:39:13 INFO 2020-08-26 10:39:13 INFO 2020-08-26 10:39:13 INFO 2020-08-26 10:39:13 INFO 2020-08-26 10:39:13 INFO 2020-08-26 10:39:13 INFO 2020-08-26 10:39:13 INFO 2020-08-26 10:39:14 INFO 2020-08-26 10:39:14 INFO 2020-08-26 10:39:14 INFO 2020-08-26 10:39:15 INFO 2020-08-26 10:39:15 INFO 2020-08-26 10:39:15 INFO 2020-08-26 10:39:15 INFO 2020-08-26 10:39:15 INFO 2020-08-26 10:39:15 INFO 2020-08-26 10:39:15 INFO 2020-08-26 10:39:15 INFO 2020-08-26 10:39:15 INFO 2020-08-26 10:39:15 INFO 2020-08-26 10:39:15 INFO 2020-08-26 10:39:15 INFO 2020-08-26 10:39:15 INFO 2020-08-26 10:39:15 INFO 2020-08-26 10:39:38 INFO 2020-08-26 10:39:50 INFO 2020-08-26 10:39:50 INFO 2020-08-26 10:39:50 INFO 2020-08-26 10:39:50 INFO 2020-08-26 10:39:50 INFO Problem type is: regression
Fitting MatPipe pipeline to data.
AutoFeaturizer: Starting fitting.
AutoFeaturizer: Guessing oxidation states of compositions, as they were not present in input.
AutoFeaturizer: Featurizer type structure not in the dataframe to be fitted. Skipping…
AutoFeaturizer: Featurizer type bandstructure not in the dataframe to be fitted. Skipping…
AutoFeaturizer: Featurizer type dos not in the dataframe to be fitted. Skipping…
AutoFeaturizer: Finished fitting.
AutoFeaturizer: Starting transforming.
AutoFeaturizer: Featurizing with ElementProperty.
AutoFeaturizer: Featurizer type structure not in the dataframe. Skipping…
AutoFeaturizer: Featurizer type bandstructure not in the dataframe. Skipping…
AutoFeaturizer: Featurizer type dos not in the dataframe. Skipping…
AutoFeaturizer: Finished transforming.
DataCleaner: Starting fitting.
DataCleaner: Cleaning with respect to samples with sample na_method ‘drop’
DataCleaner: Replacing infinite values with nan for easier screening.
DataCleaner: Before handling na: 25 samples, 4142 features
DataCleaner: 0 samples did not have target values. They were dropped.
DataCleaner: Handling feature na by max na threshold of 0.01 with method ‘drop’.
DataCleaner: After handling na: 25 samples, 4142 features
DataCleaner: Finished fitting.
FeatureReducer: Starting fitting.
FeatureReducer: 1382 features removed due to cross correlation more than 0.95
TreeFeatureReducer: Finished tree-based feature reduction of 2759 initial features to 24
FeatureReducer: Finished fitting.
FeatureReducer: Starting transforming.
FeatureReducer: Finished transforming.
TPOTAdaptor: Starting fitting.
Problem type is: regression
Fitting MatPipe pipeline to data.
AutoFeaturizer: Starting fitting.
AutoFeaturizer: Guessing oxidation states of compositions, as they were not present in input.
AutoFeaturizer: Featurizer type structure not in the dataframe to be fitted. Skipping…
AutoFeaturizer: Featurizer type bandstructure not in the dataframe to be fitted. Skipping…
AutoFeaturizer: Featurizer type dos not in the dataframe to be fitted. Skipping…
AutoFeaturizer: Finished fitting.
AutoFeaturizer: Starting transforming.
AutoFeaturizer: Featurizing with ElementProperty.
AutoFeaturizer: Featurizing with OxidationStates.
AutoFeaturizer: Featurizing with ElectronAffinity.
AutoFeaturizer: Featurizing with IonProperty.
AutoFeaturizer: Featurizing with YangSolidSolution.
AutoFeaturizer: Featurizing with Miedema.
AutoFeaturizer: Featurizer type structure not in the dataframe. Skipping…
AutoFeaturizer: Featurizer type bandstructure not in the dataframe. Skipping…
AutoFeaturizer: Featurizer type dos not in the dataframe. Skipping…
AutoFeaturizer: Finished transforming.
DataCleaner: Starting fitting.
DataCleaner: Cleaning with respect to samples with sample na_method ‘drop’
DataCleaner: Replacing infinite values with nan for easier screening.
DataCleaner: Before handling na: 25 samples, 4155 features
DataCleaner: 0 samples did not have target values. They were dropped.
DataCleaner: Handling feature na by max na threshold of 0.01 with method ‘drop’.
DataCleaner: These 8 features were removed as they had more than 1.0% missing values: {‘std_dev oxidation state’, ‘maximum oxidation state’, ‘avg anion electron affinity’, ‘minimum oxidation state’, ‘range oxidation state’, ‘compound possible’, ‘max ionic char’, ‘avg ionic char’}
DataCleaner: After handling na: 25 samples, 4147 features
DataCleaner: Finished fitting.
FeatureReducer: Starting fitting.
FeatureReducer: 1385 features removed due to cross correlation more than 0.95
TreeFeatureReducer: Finished tree-based feature reduction of 2761 initial features to 78
FeatureReducer: Finished fitting.
FeatureReducer: Starting transforming.
FeatureReducer: Finished transforming.
TPOTAdaptor: Starting fitting.

Krkaufma · August 27, 2020, 4:41pm

Quick update on some attempted debugging:
I (just now) started from a fresh conda venv running Python 3.7.1 and only installed automatminer and its dependencies using pip to see if this would fix it.

I still see the same errors (array with 0 features and bad combinations of args within tpot/sklearn), and now I also keep running into dead kernels and computer crashes when running MatPipe.fit in debug or express mode with n_jobs=1. I am running the commands inside a jupyter notebook.

pip freeze:

argon2-cffi==20.1.0
attrs==20.1.0
automatminer==1.0.3.20200727
backcall==0.2.0
bleach==3.1.5
certifi==2020.6.20
cffi==1.14.2
chardet==3.0.4
cycler==0.10.0
deap==1.3.1
decorator==4.4.2
defusedxml==0.6.0
entrypoints==0.3
future==0.18.2
idna==2.10
importlib-metadata==1.7.0
ipykernel==5.3.4
ipython==7.17.0
ipython-genutils==0.2.0
ipywidgets==7.5.1
jedi==0.17.2
Jinja2==2.11.2
joblib==0.16.0
jsonschema==3.2.0
jupyter==1.0.0
jupyter-client==6.1.7
jupyter-console==6.1.0
jupyter-core==4.6.3
kiwisolver==1.2.0
MarkupSafe==1.1.1
matminer==0.6.2
matplotlib==3.3.1
mistune==0.8.4
monty==3.0.4
mpmath==1.1.0
nbconvert==5.6.1
nbformat==5.0.7
networkx==2.5
notebook==6.1.3
numpy==1.19.1
packaging==20.4
palettable==3.3.0
pandas==1.1.1
pandocfilters==1.4.2
parso==0.7.1
pexpect==4.8.0
pickleshare==0.7.5
Pillow==7.2.0
Pint==0.15
plotly==4.9.0
prometheus-client==0.8.0
prompt-toolkit==3.0.6
ptyprocess==0.6.0
pycparser==2.20
PyDispatcher==2.0.5
Pygments==2.6.1
pymatgen==2020.1.28
pymongo==3.11.0
pyparsing==2.4.7
pyrsistent==0.16.0
python-dateutil==2.8.1
pytz==2020.1
PyYAML==5.1.2
pyzmq==19.0.2
qtconsole==4.7.6
QtPy==1.9.0
requests==2.24.0
retrying==1.3.3
ruamel.yaml==0.16.10
ruamel.yaml.clib==0.2.0
scikit-learn==0.22.2
scipy==1.5.2
Send2Trash==1.5.0
six==1.15.0
skrebate==0.6
spglib==1.16.0
stopit==1.1.2
sympy==1.6.2
tabulate==0.8.7
terminado==0.8.3
testpath==0.4.4
tornado==6.0.4
TPOT==0.11.0
tqdm==4.48.2
traitlets==4.3.3
update-checker==0.18.0
urllib3==1.25.10
wcwidth==0.2.5
webencodings==0.5.1
widgetsnbextension==3.5.1
xlrd==1.2.0
zipp==3.1.0

ardunn · August 28, 2020, 9:10pm

Hey there,

Thanks for the info. Would you mind opening an issue on the repo? I don’t think this has to do with your configuration and is instead some problem with the code that needs to be solved.

Thanks,
Alex

Krkaufma · August 31, 2020, 5:34pm

Hey Alex,

Yes, I will open an issue on the repo.

Thanks,
Kevin

Jorge_Alonso_Delgado · September 1, 2020, 9:37am

Hi @Krkaufma, @ardunn and community,

I’ve been experiencing a very similar issue to the one described by @Krkaufma ever since I started working with automatminer a month ago:

When running the “express” preset in automatminer, I get many “Found array with 0 feature(s) (shape=(50, 0))…” messages, which end in a final error like:

…
_pre_test decorator: _random_mutation_operator: num_test=0 Found array with 0 feature(s) (shape=(50, 0)) while a minimum of 1 is required…

60.14 minutes have elapsed. TPOT will close down.
TPOT closed during evaluation in one generation.
WARNING: TPOT may not provide a good pipeline if TPOT is stopped/interrupted in a early generation.

TPOT closed prematurely. Will use the current best pipeline.

ValueError Traceback (most recent call last)
|timed exec| in |module|
…
ValueError: Found array with 0 feature(s) (shape=(360, 0)) while a minimum of 1 is required.

Note: 360 is the size of the training data for each fold.

In my case, I don’t have issues of dead kernels or computer crashes, just the error stated above.
I’ve got a few more observations which might help to trace the error:

Running the “express_single” preset always finishes without showing any errors, possibly because only the rfr regressor is computed, without ML-preprocessing steps like data scaling.
Running the “debug” preset completes about 1 out of every 4 times the calculation is launched. In all cases, the error appears hundreds or thousands of times (a variable, random number of times); in the end, sometimes the pipe is fitted and sometimes the final error appears instead. I’ve observed that when the error appears too many times (let’s say, thousands), the run is almost sure to crash, but when there are not too many of these errors (a few dozen), the pipe is likely to be fitted in the end. Here is one of the many related messages:

_pre_test decorator: _random_mutation_operator: num_test=0 Found array with 0 feature(s) (shape=(50, 0)) while a minimum of 1 is required by StandardScaler…

Running the “express” preset behaves similarly to what is observed with “debug”. Sometimes it finishes, sometimes it doesn’t. The difference is the time needed to find out whether it crashes or not.
Most of the features I’m working with are automatically generated by AutoFeaturizer (composition).

Due to the common appearance of:
Found array with 0 feature(s) (shape=(50, 0)) while a minimum of 1 is required by StandardScaler… (or any other scaler)
I was thinking that these messages might stem from issues when trying to apply scalers to the features (commonly associated with linear ML models).

I was also wondering whether the error is strictly related to matminer, automatminer, or tpot. Due to the random behavior of this error (the pipe sometimes manages to finish the fitting even with many of these messages), I was thinking that it could be related to tpot (due to its stochastic nature), but it might be something else.
Might it be a matter of version compatibility between any of the involved libraries?
The characteristics of my data?
I just wanted to share this info before opening an issue on the repo (maybe @Krkaufma is already in the process of doing so). Maybe I’m missing something very basic.

Any light shed on this issue would be much appreciated.

Here is my system info:
• OS: Ubuntu 18.04.3 LTS (as a virtual machine in Windows 10 v 1909 as the host)
• Python version: 3.6.10 (h7579374_2) installed initially from Anaconda on a new environment. The environment was initially setup with the configuration of the MaterialsProject Workshop 2020 as stated in the files provided in the github repository.
• jupyter version: jupyter-notebook : 6.0.3 (ipython: 7.16.1)
• pip freeze:

ase==3.19.2
atomate==0.9.5
attrs==19.3.0
Automat==20.2.0
automatminer==1.0.3.20200727
backcall==0.2.0
bleach==3.1.5
boltons==20.2.0
bravado==10.6.2
bravado-core==5.17.0
Brotli==1.0.7
brotlipy==0.7.0
certifi==2020.6.20
cffi==1.14.0
chardet==3.0.4
click==7.1.2
constantly==15.1.0
crochet==1.12.0
cryptography==2.9.2
crystal-toolkit==2020.6.4
custodian==2020.4.27
cycler==0.10.0
dash==1.14.0
dash-core-components==1.10.2
dash-daq==0.4.0
dash-html-components==1.0.3
dash-mp-components==0.0.24
dash-renderer==1.6.0
dash-table==4.9.0
dataclasses==0.7
deap==1.3.1
decorator==4.4.2
defusedxml==0.6.0
dnspython==2.0.0
entrypoints==0.3
fido==4.2.2
FireWorks==1.9.6
Flask==1.1.2
Flask-Caching==1.9.0
Flask-Compress==1.5.0
flask-paginate==0.7.0
future==0.18.2
gevent==1.5.0
graphviz==0.14.1
greenlet==0.4.16
gunicorn==20.0.4
h5py==2.10.0
habanero==0.7.4
hiphive==0.7.1
htmlmin==0.1.12
hyperlink==20.0.0
idna @ file:///tmp/build/80754af9/idna_1593446292537/work
imageio==2.9.0
importlib-metadata @ file:///tmp/build/80754af9/importlib-metadata_1593446433964/work
incremental==17.5.0
inflect==4.1.0
ipykernel @ file:///tmp/build/80754af9/ipykernel_1596206602906/work/dist/ipykernel-5.3.4-py3-none-any.whl
ipython @ file:///tmp/build/80754af9/ipython_1593447367857/work
ipython-genutils==0.2.0
ipywidgets==7.5.1
itsdangerous==2.0.0a1
jedi @ file:///tmp/build/80754af9/jedi_1592841914522/work
Jinja2==2.11.2
joblib==0.16.0
jsmin==2.2.2
json2html==1.3.0
json5==0.9.5
jsonpointer==2.0
jsonref==0.2
jsonschema==3.2.0
jupyter-client @ file:///tmp/build/80754af9/jupyter_client_1594826976318/work
jupyter-core==4.6.3
jupyterlab==2.1.5
jupyterlab-server @ file:///tmp/build/80754af9/jupyterlab_server_1594164409481/work
kiwisolver==1.2.0
latexcodec==2.0.1
livereload==2.6.2
llvmlite==0.33.0
lunr==0.5.8
Markdown==3.2.2
MarkupSafe==1.1.1
matminer==0.6.2
matplotlib==3.3.0
mistune==0.8.4
mkdocs==1.1.2
mkdocs-material==5.5.0
mkdocs-material-extensions==1.0
mkdocs-minify-plugin==0.3.0
mknotebooks==0.4.1
monotonic==1.5
monty==3.0.4
-e git+https://github.com/materialsproject/workshop.git@3b09665891608b3f167fe3a9fbb3c5c20e711637#egg=mp_workshop
mpcontribs-client==3.3.0
mpmath==1.1.0
msgpack==1.0.0
msgpack-python==0.5.6
nb-conda-kernels @ file:///home/conda/feedstock_root/build_artifacts/nb_conda_kernels_1596524026459/work
nbconvert==5.6.1
nbformat==5.0.7
networkx==2.4
nltk==3.5
notebook==6.0.3
numba==0.50.1
numpy==1.19.1
packaging==20.4
palettable==3.3.0
pandas==1.1.0
pandocfilters==1.4.2
parso==0.7.0
pexpect==4.8.0
pickleshare==0.7.5
Pillow==7.2.0
Pint==0.14
plotly==4.9.0
prometheus-client==0.8.0
prompt-toolkit==3.0.5
ptyprocess==0.6.0
PubChemPy==1.0.4
pyasn1==0.4.8
pyasn1-modules==0.2.8
pybtex==0.22.2
pycparser @ file:///tmp/build/80754af9/pycparser_1594388511720/work
pydantic==1.6.1
pydash==4.8.0
PyDispatcher==2.0.5
Pygments==2.6.1
PyHamcrest==2.0.2
pyIsEmail==1.3.2
pymatgen==2020.1.28
pymatgen-diffusion==2019.8.18
pymdown-extensions==7.1
pymongo==3.11.0
pyOpenSSL @ file:///tmp/build/80754af9/pyopenssl_1594392929924/work
pyparsing==2.4.7
pyrsistent==0.16.0
PySocks==1.7.1
python-dateutil==2.8.1
pytz==2020.1
PyWavelets==1.1.1
PyYAML==5.1.2
pyzmq==19.0.2
redis==3.5.3
regex==2020.7.14
requests @ file:///tmp/build/80754af9/requests_1592841827918/work
requests-futures==1.0.0
retrying==1.3.3
rfc3987==1.3.8
robocrys==0.2.4
ruamel.yaml==0.16.10
ruamel.yaml.clib==0.2.0
scikit-image==0.17.2
scikit-learn==0.22.2
scipy==1.5.2
Send2Trash==1.5.0
sentry-sdk==0.16.3
service-identity==18.1.0
simplejson==3.17.2
six==1.15.0
skrebate==0.6
spglib==1.16.0
stopit==1.1.2
strict-rfc3339==0.7
swagger-spec-validator==2.7.3
sympy==1.6.1
tabulate==0.8.7
terminado==0.8.3
testpath==0.4.4
tifffile==2020.7.24
tornado==6.0.4
TPOT==0.11.0
tqdm==4.48.2
traitlets==4.3.3
Twisted==20.3.0
typing==3.7.4.3
typing-extensions==3.7.4.2
update-checker==0.18.0
urllib3==1.25.9
wcwidth @ file:///tmp/build/80754af9/wcwidth_1593447189090/work
webcolors==1.11.1
webencodings==0.5.1
Werkzeug==1.0.1
widgetsnbextension==3.5.1
wrapt==1.12.1
yelp-bytes==0.3.0
yelp-encodings==0.1.3
zipp==3.1.0
zope.interface==5.1.0

• a full automatminer.log (the relevant part):

When creating and modifying the associated preset pipeline:

from automatminer import get_preset_config, FeatureReducer, MatPipe
config = get_preset_config("express")
config["reducer"] = FeatureReducer(reducers=("corr",))

from automatminer.pipeline import MatPipe
pipe = MatPipe(**config)

output:

/home/jorge/anaconda3/envs/workshop-v2/lib/python3.6/site-packages/sklearn/utils/deprecation.py:144: FutureWarning: The sklearn.metrics.scorer module is deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.metrics. Anything that cannot be imported from sklearn.metrics is now part of the private API.
warnings.warn(message, FutureWarning)
/home/jorge/anaconda3/envs/workshop-v2/lib/python3.6/site-packages/sklearn/utils/deprecation.py:144: FutureWarning: The sklearn.feature_selection.base module is deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.feature_selection. Anything that cannot be imported from sklearn.feature_selection is now part of the private API.
warnings.warn(message, FutureWarning)
/home/jorge/anaconda3/envs/workshop-v2/lib/python3.6/site-packages/sklearn/utils/deprecation.py:144: FutureWarning:

The sklearn.neighbors.unsupervised module is deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.neighbors. Anything that cannot be imported from sklearn.neighbors is now part of the private API.

/home/jorge/anaconda3/envs/workshop-v2/lib/python3.6/site-packages/sklearn/externals/joblib/__init__.py:15: FutureWarning:

sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.

Then for fitting the pipeline:

from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=1)
predicted_folds = pipe.benchmark(df, target, kf)

output:

2020-09-01 12:19:08 WARNING Beginning benchmark.
2020-09-01 12:19:08 INFO Training on fold index 0
2020-09-01 12:19:08 INFO Problem type is: regression
2020-09-01 12:19:08 INFO Fitting MatPipe pipeline to data.
2020-09-01 12:19:08 INFO AutoFeaturizer: Starting fitting.
2020-09-01 12:19:08 INFO AutoFeaturizer: Compositions detected as strings. Attempting conversion to Composition objects…
HBox(children=(FloatProgress(value=0.0, description=‘StrToComposition’, max=360.0, style=ProgressStyle(descrip…

2020-09-01 12:19:08 INFO AutoFeaturizer: Guessing oxidation states of compositions, as they were not present in input.
HBox(children=(FloatProgress(value=0.0, description=‘CompositionToOxidComposition’, max=360.0, style=ProgressS…

2020-09-01 12:19:10 INFO AutoFeaturizer: Featurizer type structure not in the dataframe to be fitted. Skipping…
2020-09-01 12:19:10 INFO AutoFeaturizer: Featurizer type bandstructure not in the dataframe to be fitted. Skipping…
2020-09-01 12:19:10 INFO AutoFeaturizer: Featurizer type dos not in the dataframe to be fitted. Skipping…
2020-09-01 12:19:10 INFO AutoFeaturizer: Finished fitting.
2020-09-01 12:19:10 INFO AutoFeaturizer: Starting transforming.
2020-09-01 12:19:10 INFO AutoFeaturizer: Featurizing with ElementProperty.
HBox(children=(FloatProgress(value=0.0, description=‘ElementProperty’, max=360.0, style=ProgressStyle(descript…

2020-09-01 12:19:15 INFO AutoFeaturizer: Featurizing with OxidationStates.
HBox(children=(FloatProgress(value=0.0, description=‘OxidationStates’, max=360.0, style=ProgressStyle(descript…

…

2020-09-01 12:19:28 INFO AutoFeaturizer: Featurizer type structure not in the dataframe. Skipping…
2020-09-01 12:19:28 INFO AutoFeaturizer: Featurizer type bandstructure not in the dataframe. Skipping…
2020-09-01 12:19:28 INFO AutoFeaturizer: Featurizer type dos not in the dataframe. Skipping…
2020-09-01 12:19:28 INFO AutoFeaturizer: Finished transforming.
2020-09-01 12:19:28 INFO DataCleaner: Starting fitting.
2020-09-01 12:19:28 INFO DataCleaner: Cleaning with respect to samples with sample na_method ‘drop’
2020-09-01 12:19:28 INFO DataCleaner: Replacing infinite values with nan for easier screening.
2020-09-01 12:19:28 INFO DataCleaner: Before handling na: 360 samples, 148 features
2020-09-01 12:19:28 INFO DataCleaner: 0 samples did not have target values. They were dropped.
2020-09-01 12:19:28 INFO DataCleaner: Handling feature na by max na threshold of 0.01 with method ‘drop’.
2020-09-01 12:19:28 INFO DataCleaner: These 1 features were removed as they had more than 1.0% missing values: {‘avg anion electron affinity’}
2020-09-01 12:19:29 INFO DataCleaner: After handling na: 360 samples, 147 features
2020-09-01 12:19:29 INFO DataCleaner: Finished fitting.
2020-09-01 12:19:29 INFO FeatureReducer: Starting fitting.
2020-09-01 12:19:29 INFO FeatureReducer: 89 features removed due to cross correlation more than 0.95
2020-09-01 12:19:29 INFO FeatureReducer: Finished fitting.
2020-09-01 12:19:29 INFO FeatureReducer: Starting transforming.
2020-09-01 12:19:29 INFO FeatureReducer: Finished transforming.
2020-09-01 12:19:29 INFO TPOTAdaptor: Starting fitting.
Version 0.11.0 of tpot is outdated. Version 0.11.5 was released Monday June 01, 2020.
27 operators have been imported by TPOT.
HBox(children=(FloatProgress(value=0.0, description=‘Optimization Progress’, max=20.0, style=ProgressStyle(des…
_pre_test decorator: _random_mutation_operator: num_test=0 Found array with 0 feature(s) (shape=(50, 0)) while a minimum of 1 is required…
Generation 1 - Current Pareto front scores:
-3 nan KNeighborsRegressor(Binarizer(SelectFromModel(input_matrix, SelectFromModel__ExtraTreesRegressor__max_features=0.7500000000000002, SelectFromModel__ExtraTreesRegressor__n_estimators=100, SelectFromModel__threshold=0.2), Binarizer__threshold=0.55), KNeighborsRegressor__n_neighbors=5, KNeighborsRegressor__p=1, KNeighborsRegressor__weights=uniform)
-3 -6.2428028442977705 KNeighborsRegressor(PolynomialFeatures(SelectPercentile(input_matrix, SelectPercentile__percentile=94), PolynomialFeatures__degree=2, PolynomialFeatures__include_bias=False, PolynomialFeatures__interaction_only=False), KNeighborsRegressor__n_neighbors=86, KNeighborsRegressor__p=2, KNeighborsRegressor__weights=uniform)

_pre_test decorator: _random_mutation_operator: num_test=0 Found array with 0 feature(s) (shape=(50, 0)) while a minimum of 1 is required…
_pre_test decorator: _random_mutation_operator: num_test=0 Found array with 0 feature(s) (shape=(50, 0)) while a minimum of 1 is required…
_pre_test decorator: _random_mutation_operator: num_test=0 Found array with 0 feature(s) (shape=(50, 0)) while a minimum of 1 is required…
Generation 2 - Current Pareto front scores:
…
…
Generation 6 - Current Pareto front scores:
…
-3 nan KNeighborsRegressor(Binarizer(SelectFromModel(input_matrix, SelectFromModel__ExtraTreesRegressor__max_features=0.7500000000000002, SelectFromModel__ExtraTreesRegressor__n_estimators=100, SelectFromModel__threshold=0.2), Binarizer__threshold=0.6000000000000001), KNeighborsRegressor__n_neighbors=5, KNeighborsRegressor__p=1, KNeighborsRegressor__weights=distance)
-3 nan KNeighborsRegressor(Binarizer(SelectFromModel(input_matrix, SelectFromModel__ExtraTreesRegressor__max_features=0.7500000000000002, SelectFromModel__ExtraTreesRegressor__n_estimators=100, SelectFromModel__threshold=0.2), Binarizer__threshold=0.6000000000000001), KNeighborsRegressor__n_neighbors=5, KNeighborsRegressor__p=1, KNeighborsRegressor__weights=uniform)
-3 nan KNeighborsRegressor(Binarizer(SelectFromModel(input_matrix, SelectFromModel__ExtraTreesRegressor__max_features=0.7500000000000002, SelectFromModel__ExtraTreesRegressor__n_estimators=100, SelectFromModel__threshold=0.2), Binarizer__threshold=0.55), KNeighborsRegressor__n_neighbors=5, KNeighborsRegressor__p=1, KNeighborsRegressor__weights=uniform)

_pre_test decorator: _random_mutation_operator: num_test=0 Found array with 0 feature(s) (shape=(50, 0)) while a minimum of 1 is required…
_pre_test decorator: _random_mutation_operator: num_test=0 Found array with 0 feature(s) (shape=(50, 0)) while a minimum of 1 is required…
…

60.14 minutes have elapsed. TPOT will close down.
TPOT closed during evaluation in one generation.
WARNING: TPOT may not provide a good pipeline if TPOT is stopped/interrupted in a early generation.

TPOT closed prematurely. Will use the current best pipeline.

ValueError Traceback (most recent call last)
in

~/anaconda3/envs/workshop-v2/lib/python3.6/site-packages/automatminer/utils/pkg.py in wrapper(*args, **kwargs)
102 def wrapper(*args, **kwargs):
103 args[0].is_fit = False
→ 104 result = func(*args, **kwargs)
105 args[0].is_fit = True
106 return result

~/anaconda3/envs/workshop-v2/lib/python3.6/site-packages/automatminer/pipeline.py in benchmark(self, df, target, kfold, fold_subset, cache, ignore)
334 test = df.iloc[test_ix].sample(frac=1)
335 train = df[~df.index.isin(test.index)].sample(frac=1)
→ 336 self.fit(train, target)
337 logger.info(“Predicting fold index {}”.format(fold))
338 test = self.predict(test, ignore=ignore)

~/anaconda3/envs/workshop-v2/lib/python3.6/site-packages/automatminer/utils/pkg.py in wrapper(*args, **kwargs)
102 def wrapper(*args, **kwargs):
103 args[0].is_fit = False
→ 104 result = func(*args, **kwargs)
105 args[0].is_fit = True
106 return result

~/anaconda3/envs/workshop-v2/lib/python3.6/site-packages/automatminer/pipeline.py in fit(self, df, target)
182 df = self.cleaner.fit_transform(df, target)
183 df = self.reducer.fit_transform(df, target)
→ 184 self.learner.fit(df, target)
185 logger.info(“MatPipe successfully fit.”)
186 self.post_fit_df = df

~/anaconda3/envs/workshop-v2/lib/python3.6/site-packages/automatminer/utils/log.py in wrapper(*args, **kwargs)
94 self = args[0]
95 logger.info(“{}Starting {}.”.format(self._log_prefix, operation))
—> 96 result = meth(*args, **kwargs)
97 logger.info(“{}Finished {}.”.format(self._log_prefix, operation))
98 return result

~/anaconda3/envs/workshop-v2/lib/python3.6/site-packages/automatminer/utils/pkg.py in wrapper(*args, **kwargs)
102 def wrapper(*args, **kwargs):
103 args[0].is_fit = False
→ 104 result = func(*args, **kwargs)
105 args[0].is_fit = True
106 return result

~/anaconda3/envs/workshop-v2/lib/python3.6/site-packages/automatminer/automl/adaptors.py in fit(self, df, target, **fit_kwargs)
135 self._features = df.drop(columns=target).columns.tolist()
136 self._fitted_target = target
→ 137 self._backend = self._backend.fit(X, y, **fit_kwargs)
138 return self
139

~/anaconda3/envs/workshop-v2/lib/python3.6/site-packages/tpot/base.py in fit(self, features, target, sample_weight, groups)
744 # raise the exception if it’s our last attempt
745 if attempt == (attempts - 1):
→ 746 raise e
747 return self
748

~/anaconda3/envs/workshop-v2/lib/python3.6/site-packages/tpot/base.py in fit(self, features, target, sample_weight, groups)
736
737 self._update_top_pipeline()
→ 738 self._summary_of_best_pipeline(features, target)
739 # Delete the temporary cache before exiting
740 self._cleanup_memory()

~/anaconda3/envs/workshop-v2/lib/python3.6/site-packages/tpot/base.py in _summary_of_best_pipeline(self, features, target)
860 with warnings.catch_warnings():
861 warnings.simplefilter(‘ignore’)
→ 862 self.pareto_front_fitted_pipelines[str(pipeline)].fit(features, target)
863
864 def predict(self, features):

~/anaconda3/envs/workshop-v2/lib/python3.6/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
348 This estimator
349 “”"
→ 350 Xt, fit_params = self._fit(X, y, **fit_params)
351 with _print_elapsed_time(‘Pipeline’,
352 self._log_message(len(self.steps) - 1)):

~/anaconda3/envs/workshop-v2/lib/python3.6/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
313 message_clsname=‘Pipeline’,
314 message=self._log_message(step_idx),
→ 315 **fit_params_steps[name])
316 # Replace the transformer of the step with the fitted
317 # transformer. This is necessary when loading the transformer

~/anaconda3/envs/workshop-v2/lib/python3.6/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
563
564 def __call__(self, *args, **kwargs):
→ 565 return self._cached_call(args, kwargs)[0]
566
567 def __getstate__(self):

~/anaconda3/envs/workshop-v2/lib/python3.6/site-packages/joblib/memory.py in _cached_call(self, args, kwargs, shelving)
529
530 if must_call:
→ 531 out, metadata = self.call(*args, **kwargs)
532 if self.mmap_mode is not None:
533 # Memmap the output at the first call to be consistent with

~/anaconda3/envs/workshop-v2/lib/python3.6/site-packages/joblib/memory.py in call(self, *args, **kwargs)
725 if self._verbose > 0:
726 print(format_call(self.func, args, kwargs))
→ 727 output = self.func(*args, **kwargs)
728 self.store_backend.dump_item(
729 [func_id, args_id], output, verbose=self._verbose)

~/anaconda3/envs/workshop-v2/lib/python3.6/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
726 with _print_elapsed_time(message_clsname, message):
727 if hasattr(transformer, ‘fit_transform’):
→ 728 res = transformer.fit_transform(X, y, **fit_params)
729 else:
730 res = transformer.fit(X, y, **fit_params).transform(X)

~/anaconda3/envs/workshop-v2/lib/python3.6/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
572 else:
573 # fit method of arity 2 (supervised transformation)
→ 574 return self.fit(X, y, **fit_params).transform(X)
575
576

~/anaconda3/envs/workshop-v2/lib/python3.6/site-packages/sklearn/preprocessing/_data.py in fit(self, X, y)
1946 X : array-like
1947 “”"
→ 1948 check_array(X, accept_sparse=‘csr’)
1949 return self
1950

~/anaconda3/envs/workshop-v2/lib/python3.6/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
592 " a minimum of %d is required%s."
593 % (n_features, array.shape, ensure_min_features,
→ 594 context))
595
596 if warn_on_dtype and dtype_orig is not None and array.dtype != dtype_orig:

ValueError: Found array with 0 feature(s) (shape=(360, 0)) while a minimum of 1 is required.

Jorge_Alonso_Delgado · September 4, 2020, 9:42am

Hi everyone!

A quick update. To evaluate the effect of the environment, I’ve created a new conda env with just the necessary packages:

Python 3.7.9
Automatminer
jupyter
Ipywidgets

This is the pip freeze:

argon2-cffi @ file:///tmp/build/80754af9/argon2-cffi_1596828452693/work
ase==3.18.1
attrs @ file:///tmp/build/80754af9/attrs_1598374659300/work
automatminer==1.0.3.20200727
backcall==0.2.0
bleach==3.1.5
certifi==2020.6.20
cffi @ file:///tmp/build/80754af9/cffi_1598370813909/work
chardet==3.0.4
cycler==0.10.0
deap==1.3.1
decorator==4.4.2
defusedxml==0.6.0
entrypoints==0.3
future==0.18.2
idna==2.10
importlib-metadata @ file:///tmp/build/80754af9/importlib-metadata_1593446408836/work
ipykernel @ file:///tmp/build/80754af9/ipykernel_1596206598566/work/dist/ipykernel-5.3.4-py3-none-any.whl
ipython @ file:///tmp/build/80754af9/ipython_1598883837425/work
ipython-genutils==0.2.0
ipywidgets==7.5.1
jedi @ file:///tmp/build/80754af9/jedi_1596490743326/work
Jinja2==2.11.2
joblib==0.16.0
jsonschema @ file:///tmp/build/80754af9/jsonschema_1594363551272/work
jupyter-client @ file:///tmp/build/80754af9/jupyter_client_1594826976318/work
jupyter-core==4.6.3
kiwisolver==1.1.0
MarkupSafe @ file:///tmp/build/80754af9/markupsafe_1594371495811/work
matminer==0.6.2
matplotlib==3.1.2
mistune @ file:///tmp/build/80754af9/mistune_1594373098390/work
mkl-fft==1.1.0
mkl-random==1.1.1
mkl-service==2.3.0
monty==4.0.0
mpmath==1.1.0
nbconvert @ file:///tmp/build/80754af9/nbconvert_1594376811065/work
nbformat==5.0.7
networkx==2.5
notebook @ file:///tmp/build/80754af9/notebook_1596838645154/work
numpy==1.17.4
packaging==20.4
palettable==3.3.0
pandas==1.1.1
pandocfilters==1.4.2
parso==0.7.0
pexpect @ file:///tmp/build/80754af9/pexpect_1594383317248/work
pickleshare @ file:///tmp/build/80754af9/pickleshare_1594384075987/work
Pint==0.15
plotly==4.9.0
prometheus-client==0.8.0
prompt-toolkit @ file:///tmp/build/80754af9/prompt-toolkit_1598885458782/work
ptyprocess==0.6.0
pycparser @ file:///tmp/build/80754af9/pycparser_1594388511720/work
PyDispatcher==2.0.5
Pygments==2.6.1
pymatgen==2020.1.28
pymongo==3.11.0
pyparsing==2.4.5
pyrsistent==0.16.0
python-dateutil==2.8.1
pytz==2020.1
PyYAML==5.1.2
pyzmq==19.0.1
requests==2.24.0
retrying==1.3.3
ruamel.yaml==0.16.10
ruamel.yaml.clib==0.2.0
scikit-learn==0.22.2
scipy==1.3.3
Send2Trash==1.5.0
six==1.13.0
skrebate==0.6
spglib==1.16.0
stopit==1.1.2
sympy==1.6.2
tabulate==0.8.7
terminado==0.8.3
testpath==0.4.4
tornado==6.0.4
TPOT==0.11.0
tqdm==4.48.2
traitlets==4.3.3
update-checker==0.18.0
urllib3==1.25.10
wcwidth @ file:///tmp/build/80754af9/wcwidth_1593447189090/work
webencodings==0.5.1
widgetsnbextension @ file:///home/conda/feedstock_root/build_artifacts/widgetsnbextension_1594164347302/work
zipp==3.1.0

System info as described above:
• OS: Ubuntu 18.04.3 LTS (as a virtual machine in Windows 10 v 1909 as the host)
• Python version: 3.6.10 (h7579374_2) installed initially from Anaconda on a new environment. The environment was initially setup with the configuration of the MaterialsProject Workshop 2020 as stated in the files provided in the github repository.
• jupyter version: jupyter-notebook : 6.0.3 (ipython: 7.16.1)

So I ran the “express” preset again, but I still very often end up with the final error:

ValueError: Found array with 0 feature(s) (shape=(360, 0)) while a minimum of 1 is required.

My workaround so far has been to launch several runs simultaneously (e.g. 10); eventually one of them finishes.

I wonder whether running simultaneous tests is inadvisable (maybe I am sabotaging myself by doing this).

Still, I would like to know whether the “array with 0 feature(s)” message (when working with linear models) and the occasional crashes are, let’s say, “normal” given the nature of TPOT, or whether there is something wrong on my side.

Any recommendation or best practice (if you notice something weird in my methods), would be very welcome.

Thanks again for the help.

Regards,

Jorge

ardunn · September 12, 2020, 7:04pm

Hey Jorge,

Thanks for letting me know. I am working on a solution for this issue and will update this thread when it is done!

Thanks,
Alex

Jorge_Alonso_Delgado · November 30, 2020, 7:28am

Hi everybody,

A few months after this topic was created, I bring an update and possibly a partial solution for ValueErrors of the following types:

ValueError: Unsupported set of arguments:
ValueError: x and y arrays must have at least 2 entries
ValueError: X contains negative values
ValueError: Found array with 0 feature(s)

when running benchmark methods in automatminer.

The final ValueError appeared to be random; however, in all cases there were recurring warnings of the following type recorded in the log file:

_pre_test decorator: _random_mutation_operator: num_test=0 Found array with 0 feature(s)

As @Krkaufma also mentioned in a related post, I started to suspect that specific operators included in TPOT’s default configuration might be responsible for those random crashes and that, once they were identified, we might succeed in completing AutoML runs with the express, production or heavy presets.

My approach was then to create a customized pipeline in which I customized the learner (config["learner"]), passing TPOT a custom config_dict through its kwargs. This dictionary started with only a few basic operators (e.g. one regressor, one preprocessor and one reducer); I then added operators progressively, checking the stability of the run after each addition (my criterion was the absence of the “_pre_test decorator:…” warning that used to appear during the first minutes of each run).
Of the 27 operators included in the default express (production or heavy) preset, there were 3 whose presence caused the “_pre_test decorator” warning to appear, while the other 24 were stable and the runs completed successfully. The 3 problematic operators were the following:

‘sklearn.svm.LinearSVR’:
‘sklearn.feature_selection.SelectFwe’:
‘sklearn.feature_selection.SelectFromModel’:

That’s it! Excluding these three operators worked for me, but further adjustment of the operator set might be necessary depending on the specific dataset.
If somebody has further insights or a more elegant solution to these types of errors, I would be happy to hear about it here.

Below you can find the important pieces of code that I used to customize the pipe (learner) and the config_dict of TPOTAdaptor, ready to use for anyone who would like to test this approach. Except for the config_dict, the configuration of the TPOTAdaptor shown below is basically that of the default “production” preset in automatminer:

config_dict_1={'sklearn.ensemble.RandomForestRegressor': {'n_estimators': [20, 100, 200, 500, 1000],
                                                              'max_features': [0.05, 0.15, 0.25, 0.35, 0.45, 0.55, 0.65, 0.75, 0.85, 0.95],
                                                              'min_samples_split': range(2, 21, 3),
                                                              'min_samples_leaf': range(1, 21, 3),
                                                              'bootstrap': [True, False]},

               'sklearn.ensemble.GradientBoostingRegressor': {'n_estimators': [20, 100, 200, 500, 1000],
                                                              'loss': ['ls', 'lad', 'huber', 'quantile'],
                                                              'learning_rate': [0.01, 0.1, 0.5, 1.0],
                                                              'max_depth': range(1, 11, 2),
                                                              'min_samples_split': range(2, 21, 3),
                                                              'min_samples_leaf': range(1, 21, 3),
                                                              'subsample': [0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 , 0.55,
                                                                                  0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95, 1.],
                                                              'max_features': [0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 , 0.55,
                                                                                     0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95, 1.],
                                                              'alpha': [0.75, 0.8, 0.85, 0.9, 0.95, 0.99]},

               'sklearn.ensemble.ExtraTreesRegressor': {'n_estimators': [20, 100, 200, 500, 1000],
                                                        'max_features': [0.05, 0.15, 0.25, 0.35, 0.45, 0.55, 0.65, 0.75, 0.85, 0.95],
                                                        'min_samples_split': range(2, 21, 3),
                                                        'min_samples_leaf': range(1, 21, 3),
                                                        'bootstrap': [True, False]},

               'sklearn.tree.DecisionTreeRegressor': {'max_depth': range(1, 11, 2),
                                                      'min_samples_split': range(2, 21, 3),
                                                      'min_samples_leaf': range(1, 21, 3)},  

               'sklearn.neighbors.KNeighborsRegressor': {'n_neighbors': range(1, 101),
                                                         'weights': ['uniform', 'distance'],
                                                         'p': [1, 2]}, 

               'sklearn.linear_model.Lasso': {'alpha': [1e-2, 1e-1, 1e0, 1e1, 1e2]}, #J alpha values taken from Takigawa-2019

               'sklearn.linear_model.LassoLarsCV': {'normalize': [True, False]},

               'sklearn.linear_model.RidgeCV': {},

               'sklearn.linear_model.ElasticNetCV': {'l1_ratio': [0.  , 0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 , 0.55, 0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95, 1.  ],
                                                     'tol': [1e-05, 0.0001, 0.001, 0.01, 0.1]},                 



               'sklearn.preprocessing.MaxAbsScaler': {},
               'sklearn.preprocessing.RobustScaler': {},
               'sklearn.preprocessing.StandardScaler': {},
               'sklearn.preprocessing.MinMaxScaler': {},
               'sklearn.preprocessing.Normalizer': {'norm': ['l1', 'l2', 'max']},  


               'sklearn.preprocessing.PolynomialFeatures': {'degree': [2],
                                                           'include_bias': [False],
                                                           'interaction_only': [False]},  

               'sklearn.kernel_approximation.RBFSampler': {'gamma': [0.  , 0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 , 0.55, 0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95, 1.  ]},                


               'sklearn.kernel_approximation.Nystroem': {'kernel': ['rbf', 'cosine', 'chi2', 'laplacian', 'polynomial', 'poly', 'linear', 'additive_chi2','sigmoid'],
                                                         'gamma': [0.  , 0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 , 0.55, 0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95, 1.  ],
                                                         'n_components': range(1, 11)},

               'tpot.builtins.ZeroCount': {},

               'tpot.builtins.OneHotEncoder': {'minimum_fraction': [0.05, 0.1, 0.15, 0.2, 0.25],
                                               'sparse': [False],                
                                               'threshold': [10]},

               'sklearn.preprocessing.Binarizer': {'threshold': [0.  , 0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 ,
                      0.55, 0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95, 1.  ]},

               'sklearn.cluster.FeatureAgglomeration': {'linkage': ['ward', 'complete', 'average'],
                                                        'affinity': ['euclidean', 'l1', 'l2', 'manhattan', 'cosine']},




               'sklearn.feature_selection.SelectPercentile': {'percentile': range(1, 100),
                                                              'score_func': {'sklearn.feature_selection.f_regression': None}},

                                                                                                                   
               'sklearn.decomposition.PCA': {'svd_solver': ['randomized'],
                                             'iterated_power': range(1, 11)},

               'sklearn.decomposition.FastICA': {'tol': [0.  , 0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 ,
                                                         0.55, 0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95, 1.  ]},


               'sklearn.feature_selection.VarianceThreshold': {'threshold': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.2]}}


from automatminer.pipeline import MatPipe
config = get_preset_config("production")

config["learner"] = TPOTAdaptor(max_time_mins=1440, 
                                max_eval_time_mins=20,
                                cv=5, 
                                verbosity=3,
                                memory='auto',
                                template='Selector-Transformer-Regressor',
                                scoring='neg_mean_absolute_error',
                                config_dict=config_dict_1)
pipe = MatPipe(**config)

predicted_folds = pipe.benchmark(df=df, target=target, kfold=kf)

sgbaird · July 8, 2021, 6:19am

Note that you also need

from automatminer.presets import get_preset_config
from automatminer.automl.adaptors import TPOTAdaptor