I'm trying to run an example.
Spark version: 2.3.4
import time
from sklearn import datasets, svm
from skdist.distribute.search import DistGridSearchCV
from pyspark.sql import SparkSession

# instantiate spark session
spark = (
    SparkSession
    .builder
    .getOrCreate()
)
sc = spark.sparkContext

# the digits dataset
digits = datasets.load_digits()
X = digits["data"]
y = digits["target"]

# create a classifier: a support vector classifier
classifier = svm.SVC()
param_grid = {
    "C": [0.01, 0.01, 0.1, 1.0, 10.0, 20.0, 50.0],
    "gamma": ["scale", "auto", 0.001, 0.01, 0.1],
    "kernel": ["rbf", "poly", "sigmoid"]
}
scoring = "f1_weighted"
cv = 10

# hyperparameter optimization
start = time.time()
model = DistGridSearchCV(
    classifier, param_grid,
    sc=sc, cv=cv, scoring=scoring,
    verbose=True
)
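The training step follows the skdist example; I'm assuming model.best_score_ mirrors scikit-learn's GridSearchCV attribute here:

model.fit(X, y)
print("Train time: {0}".format(time.time() - start))   # elapsed time of the distributed search
print("Best score: {0}".format(model.best_score_))     # assumed to work like GridSearchCV.best_score_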
When I try to train the model with model.fit(X, y) as shown above, it fails with:
OSError: [WinError 123] Die Syntax für den Dateinamen, Verzeichnisnamen oder die Datenträgerbezeichnung ist falsch: 'C:\C:\spark\jars\spark-core_2.11-2.3.4.jar'
(In English: "The filename, directory name, or volume label syntax is incorrect" — note the doubled drive letter in the path.)
SPARK_HOME is set to "C:\spark" and %SPARK_HOME%\bin is on the PATH.
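To double-check the environment on the driver side, a quick sanity check (plain os.environ lookups, nothing skdist-specific):

import os
print(os.environ.get("SPARK_HOME"))      # prints C:\spark here
print(os.environ.get("PYSPARK_PYTHON"))  # the Python the workers should use (may be None if unset)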
Without the SparkContext the code runs fine, i.e. changing

model = DistGridSearchCV(
    classifier, param_grid,
    sc=sc, cv=cv, scoring=scoring)

to

model = DistGridSearchCV(
    classifier, param_grid,
    cv=cv, scoring=scoring)

(dropping only sc=sc) works.
I also tried passing the sparkHome argument without the drive letter "C:" when defining the SparkContext:

sc = SparkContext(appName="Dist_Exmp", sparkHome="spark")
sc.sparkHome  # --> 'spark'

But the value is still taken from the environment variable.
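For reference, a small diagnostic sketch that could show what the executors themselves resolve (it only runs os/sys lookups on a single partition, using standard PySpark calls):

import os
import sys

def probe(_):
    # report the environment as seen inside a Spark worker process
    return (os.environ.get("SPARK_HOME"), sys.executable, sys.path[:3])

print(sc.parallelize([0], numSlices=1).map(probe).collect())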
Here is the full traceback:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
in
----> 1 model.fit(X_train, y_train)
C:\MeineProgramme\anaconda3\lib\site-packages\skdist\distribute\search.py in fit(self, X, y, groups, **fit_params)
367 base_estimator_ = self.sc.broadcast(base_estimator)
368 partitions = _parse_partitions(self.partitions, len(fit_sets))
--> 369 out = self.sc.parallelize(fit_sets, numSlices=partitions).map(lambda x: [x[0], _fit_and_score(
370 base_estimator, X, y, scorers, x[2][0], x[2][1],
371 verbose, x[1], fit_params=fit_params,
C:\MeineProgramme\anaconda3\lib\site-packages\pyspark\rdd.py in collect(self)
812 """
813 with SCCallSiteSync(self.context) as css:
--> 814 sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
815 return list(_load_from_socket(sock_info, self._jrdd_deserializer))
816
C:\MeineProgramme\anaconda3\lib\site-packages\py4j\java_gateway.py in __call__(self, *args)
1255 answer = self.gateway_client.send_command(command)
1256 return_value = get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name)
1258
1259 for temp_arg in temp_args:
C:\MeineProgramme\anaconda3\lib\site-packages\pyspark\sql\utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
C:\MeineProgramme\anaconda3\lib\site-packages\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 1.0 failed 1 times, most recent failure: Lost task 1.0 in stage 1.0 (TID 6, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "C:\SPARK\python\lib\pyspark.zip\pyspark\worker.py", line 240, in main
File "C:\SPARK\python\lib\pyspark.zip\pyspark\worker.py", line 60, in read_command
File "C:\SPARK\python\lib\pyspark.zip\pyspark\serializers.py", line 171, in read_with_length
return self.loads(obj)
File "C:\SPARK\python\lib\pyspark.zip\pyspark\serializers.py", line 566, in loads
return pickle.loads(obj, encoding=encoding)
File "C:\MeineProgramme\anaconda3\lib\site-packages\skdist\distribute\search.py", line 14, in
from sklearn.model_selection import (
File "C:\MeineProgramme\anaconda3\lib\site-packages\sklearn\model_selection_init.py", line 19, in
from .validation import cross_val_score
File "C:\MeineProgramme\anaconda3\lib\site-packages\sklearn\model_selection_validation.py", line 27, in
from ..metrics.scorer import check_scoring, check_multimetric_scoring
File "C:\MeineProgramme\anaconda3\lib\site-packages\sklearn\metrics_init.py", line 7, in
from .ranking import auc
File "C:\MeineProgramme\anaconda3\lib\site-packages\sklearn\metrics\ranking.py", line 35, in
from ..preprocessing import label_binarize
File "C:\MeineProgramme\anaconda3\lib\site-packages\sklearn\preprocessing_init.py", line 6, in
from .function_transformer import FunctionTransformer
File "C:\MeineProgramme\anaconda3\lib\site-packages\sklearn\preprocessing_function_transformer.py", line 5, in
from ..utils.testing import assert_allclose_dense_sparse
File "C:\MeineProgramme\anaconda3\lib\site-packages\sklearn\utils\testing.py", line 718, in
import pytest
File "C:\MeineProgramme\anaconda3\lib\site-packages\pytest.py", line 6, in
from _pytest.assertion import register_assert_rewrite
File "C:\MeineProgramme\anaconda3\lib\site-packages\_pytest\assertion\__init__.py", line 7, in
from _pytest.assertion import rewrite
File "C:\MeineProgramme\anaconda3\lib\site-packages\_pytest\assertion\rewrite.py", line 26, in
from _pytest.assertion import util
File "C:\MeineProgramme\anaconda3\lib\site-packages\_pytest\assertion\util.py", line 8, in
import _pytest._code
File "C:\MeineProgramme\anaconda3\lib\site-packages\_pytest\_code\__init__.py", line 2, in
from .code import Code # noqa
File "C:\MeineProgramme\anaconda3\lib\site-packages\_pytest\_code\code.py", line 23, in
import pluggy
File "C:\MeineProgramme\anaconda3\lib\site-packages\pluggy\__init__.py", line 16, in
from .manager import PluginManager, PluginValidationError
File "C:\MeineProgramme\anaconda3\lib\site-packages\pluggy\manager.py", line 11, in
import importlib_metadata
File "C:\MeineProgramme\anaconda3\lib\site-packages\importlib_metadata_init.py", line 547, in
version = version(name)
File "C:\MeineProgramme\anaconda3\lib\site-packages\importlib_metadata_init.py", line 509, in version
return distribution(distribution_name).version
File "C:\MeineProgramme\anaconda3\lib\site-packages\importlib_metadata_init.py", line 482, in distribution
return Distribution.from_name(distribution_name)
File "C:\MeineProgramme\anaconda3\lib\site-packages\importlib_metadata_init_.py", line 183, in from_name
dist = next(dists, None)
File "C:\MeineProgramme\anaconda3\lib\site-packages\importlib_metadata_init_.py", line 425, in
for path in map(cls.switch_path, paths)
File "C:\MeineProgramme\anaconda3\lib\site-packages\importlib_metadata_init.py", line 449, in _search_path
if not root.is_dir():
File "C:\MeineProgramme\anaconda3\lib\pathlib.py", line 1358, in is_dir
return S_ISDIR(self.stat().st_mode)
File "C:\MeineProgramme\anaconda3\lib\pathlib.py", line 1168, in stat
return self._accessor.stat(self)
OSError: [WinError 123] Die Syntax für den Dateinamen, Verzeichnisnamen oder die Datenträgerbezeichnung ist falsch: 'C:\C:\spark\jars\spark-core_2.11-2.3.4.jar'
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:336)
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:475)
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:458)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:290)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:945)
at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:945)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1661)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1649)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1648)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1648)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1882)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1831)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1820)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2034)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2055)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2074)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2099)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:945)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
at org.apache.spark.rdd.RDD.collect(RDD.scala:944)
at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:165)
at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "C:\SPARK\python\lib\pyspark.zip\pyspark\worker.py", line 240, in main
File "C:\SPARK\python\lib\pyspark.zip\pyspark\worker.py", line 60, in read_command
File "C:\SPARK\python\lib\pyspark.zip\pyspark\serializers.py", line 171, in read_with_length
return self.loads(obj)
File "C:\SPARK\python\lib\pyspark.zip\pyspark\serializers.py", line 566, in loads
return pickle.loads(obj, encoding=encoding)
File "C:\MeineProgramme\anaconda3\lib\site-packages\skdist\distribute\search.py", line 14, in
from sklearn.model_selection import (
File "C:\MeineProgramme\anaconda3\lib\site-packages\sklearn\model_selection_init.py", line 19, in
from .validation import cross_val_score
File "C:\MeineProgramme\anaconda3\lib\site-packages\sklearn\model_selection_validation.py", line 27, in
from ..metrics.scorer import check_scoring, check_multimetric_scoring
File "C:\MeineProgramme\anaconda3\lib\site-packages\sklearn\metrics_init.py", line 7, in
from .ranking import auc
File "C:\MeineProgramme\anaconda3\lib\site-packages\sklearn\metrics\ranking.py", line 35, in
from ..preprocessing import label_binarize
File "C:\MeineProgramme\anaconda3\lib\site-packages\sklearn\preprocessing_init.py", line 6, in
from .function_transformer import FunctionTransformer
File "C:\MeineProgramme\anaconda3\lib\site-packages\sklearn\preprocessing_function_transformer.py", line 5, in
from ..utils.testing import assert_allclose_dense_sparse
File "C:\MeineProgramme\anaconda3\lib\site-packages\sklearn\utils\testing.py", line 718, in
import pytest
File "C:\MeineProgramme\anaconda3\lib\site-packages\pytest.py", line 6, in
from _pytest.assertion import register_assert_rewrite
File "C:\MeineProgramme\anaconda3\lib\site-packages\_pytest\assertion\__init__.py", line 7, in
from _pytest.assertion import rewrite
File "C:\MeineProgramme\anaconda3\lib\site-packages\_pytest\assertion\rewrite.py", line 26, in
from _pytest.assertion import util
File "C:\MeineProgramme\anaconda3\lib\site-packages\_pytest\assertion\util.py", line 8, in
import _pytest._code
File "C:\MeineProgramme\anaconda3\lib\site-packages\_pytest\_code\__init__.py", line 2, in
from .code import Code # noqa
File "C:\MeineProgramme\anaconda3\lib\site-packages\_pytest\_code\code.py", line 23, in
import pluggy
File "C:\MeineProgramme\anaconda3\lib\site-packages\pluggy\__init__.py", line 16, in
from .manager import PluginManager, PluginValidationError
File "C:\MeineProgramme\anaconda3\lib\site-packages\pluggy\manager.py", line 11, in
import importlib_metadata
File "C:\MeineProgramme\anaconda3\lib\site-packages\importlib_metadata_init.py", line 547, in
version = version(name)
File "C:\MeineProgramme\anaconda3\lib\site-packages\importlib_metadata_init.py", line 509, in version
return distribution(distribution_name).version
File "C:\MeineProgramme\anaconda3\lib\site-packages\importlib_metadata_init.py", line 482, in distribution
return Distribution.from_name(distribution_name)
File "C:\MeineProgramme\anaconda3\lib\site-packages\importlib_metadata_init_.py", line 183, in from_name
dist = next(dists, None)
File "C:\MeineProgramme\anaconda3\lib\site-packages\importlib_metadata_init_.py", line 425, in
for path in map(cls.switch_path, paths)
File "C:\MeineProgramme\anaconda3\lib\site-packages\importlib_metadata_init.py", line 449, in _search_path
if not root.is_dir():
File "C:\MeineProgramme\anaconda3\lib\pathlib.py", line 1358, in is_dir
return S_ISDIR(self.stat().st_mode)
File "C:\MeineProgramme\anaconda3\lib\pathlib.py", line 1168, in stat
return self._accessor.stat(self)
OSError: [WinError 123] Die Syntax für den Dateinamen, Verzeichnisnamen oder die Datenträgerbezeichnung ist falsch: 'C:\C:\spark\jars\spark-core_2.11-2.3.4.jar'
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:336)
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:475)
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:458)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:290)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:945)
at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:945)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
... 1 more