In [2]:
sc.version
Out[2]:
u'1.3.1'
In [13]:
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithSGD
In [3]:
x = SparseVector(2, {1:1, 2:2, 3:3, 4:4, 5:5})
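(Note: this vector is malformed. The size is declared as 2, but the indices run from 1 to 5; SparseVector indices are 0-based and must be less than the declared size. The training step below trips over exactly this.)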
In [10]:
l = LabeledPoint(0, x)
In [12]:
r = sc.parallelize([l])
In [14]:
m = LogisticRegressionWithSGD.train(r)
---------------------------------------------------------------------------
Py4JJavaError                             Traceback (most recent call last)
<ipython-input-14-ab497c22ec7c> in <module>()
----> 1 m = LogisticRegressionWithSGD.train(r)

/usr/iop/4.0.0.0/spark/python/pyspark/mllib/classification.py in train(cls, data, iterations, step, miniBatchFraction, initialWeights, regParam, regType, intercept)
    162                                  bool(intercept))
    163 
--> 164         return _regression_train_wrapper(train, LogisticRegressionModel, data, initialWeights)
    165 
    166 

/usr/iop/4.0.0.0/spark/python/pyspark/mllib/regression.py in _regression_train_wrapper(train_func, modelClass, data, initial_weights)
    138     if initial_weights is None:
    139         initial_weights = [0.0] * len(data.first().features)
--> 140     weights, intercept = train_func(data, _convert_to_vector(initial_weights))
    141     return modelClass(weights, intercept)
    142 

/usr/iop/4.0.0.0/spark/python/pyspark/mllib/classification.py in train(rdd, i)
    160             return callMLlibFunc("trainLogisticRegressionModelWithSGD", rdd, int(iterations),
    161                                  float(step), float(miniBatchFraction), i, float(regParam), regType,
--> 162                                  bool(intercept))
    163 
    164         return _regression_train_wrapper(train, LogisticRegressionModel, data, initialWeights)

/usr/iop/4.0.0.0/spark/python/pyspark/mllib/common.py in callMLlibFunc(name, *args)
    118     sc = SparkContext._active_spark_context
    119     api = getattr(sc._jvm.PythonMLLibAPI(), name)
--> 120     return callJavaFunc(sc, api, *args)
    121 
    122 

/usr/iop/4.0.0.0/spark/python/pyspark/mllib/common.py in callJavaFunc(sc, func, *args)
    111     """ Call Java Function """
    112     args = [_py2java(sc, a) for a in args]
--> 113     return _java2py(sc, func(*args))
    114 
    115 

/usr/iop/4.0.0.0/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
    536         answer = self.gateway_client.send_command(command)
    537         return_value = get_return_value(answer, self.gateway_client,
--> 538                 self.target_id, self.name)
    539 
    540         for temp_arg in temp_args:

/usr/iop/4.0.0.0/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
    298                 raise Py4JJavaError(
    299                     'An error occurred while calling {0}{1}{2}.\n'.
--> 300                     format(target_id, '.', name), value)
    301             else:
    302                 raise Py4JError(

Py4JJavaError: An error occurred while calling o86.trainLogisticRegressionModelWithSGD.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 7 in stage 11.0 failed 1 times, most recent failure: Lost task 7.0 in stage 11.0 (TID 47, localhost): java.lang.ArrayIndexOutOfBoundsException: 2
	at org.apache.spark.mllib.linalg.BLAS$.dot(BLAS.scala:136)
	at org.apache.spark.mllib.linalg.BLAS$.dot(BLAS.scala:106)
	at org.apache.spark.mllib.optimization.LogisticGradient.compute(Gradient.scala:169)
	at org.apache.spark.mllib.optimization.GradientDescent$$anonfun$runMiniBatchSGD$1$$anonfun$1.apply(GradientDescent.scala:192)
	at org.apache.spark.mllib.optimization.GradientDescent$$anonfun$runMiniBatchSGD$1$$anonfun$1.apply(GradientDescent.scala:190)
	at scala.collection.TraversableOnce$$anonfun$foldLeft$1.apply(TraversableOnce.scala:144)
	at scala.collection.TraversableOnce$$anonfun$foldLeft$1.apply(TraversableOnce.scala:144)
	at scala.collection.Iterator$class.foreach(Iterator.scala:727)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
	at scala.collection.TraversableOnce$class.foldLeft(TraversableOnce.scala:144)
	at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1157)
	at scala.collection.TraversableOnce$class.aggregate(TraversableOnce.scala:201)
	at scala.collection.AbstractIterator.aggregate(Iterator.scala:1157)
	at org.apache.spark.rdd.RDD$$anonfun$28.apply(RDD.scala:988)
	at org.apache.spark.rdd.RDD$$anonfun$28.apply(RDD.scala:988)
	at org.apache.spark.rdd.RDD$$anonfun$29.apply(RDD.scala:989)
	at org.apache.spark.rdd.RDD$$anonfun$29.apply(RDD.scala:989)
	at org.apache.spark.rdd.RDD$$anonfun$14.apply(RDD.scala:634)
	at org.apache.spark.rdd.RDD$$anonfun$14.apply(RDD.scala:634)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:68)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
	at org.apache.spark.scheduler.Task.run(Task.scala:64)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:203)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
	at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1204)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1193)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1192)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1192)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693)
	at scala.Option.foreach(Option.scala:236)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:693)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1393)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1354)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
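The ArrayIndexOutOfBoundsException: 2 is raised inside the Scala BLAS.dot call, but the root cause is the vector built in In [3]: SparseVector(2, {1:1, 2:2, 3:3, 4:4, 5:5}) declares a size of 2 while supplying indices 1 through 5. PySpark does not validate the indices at construction time, so the problem only surfaces during training: _regression_train_wrapper sizes the initial weight vector from len(data.first().features) (here 2, the declared size), and BLAS.dot then reads past the end of that length-2 array at index 2, which is the "2" in the exception.

A minimal corrected sketch against the same 1.3.1 API (it reuses the existing sc SparkContext; the second LabeledPoint is an illustrative addition so that the training set contains both classes):

from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithSGD

# Indices are 0-based, so the largest index (5) requires a size of at least 6.
x = SparseVector(6, {1: 1.0, 2: 2.0, 3: 3.0, 4: 4.0, 5: 5.0})

# Hypothetical second point with label 1 so logistic regression sees two classes;
# every feature vector must share the same declared size (6).
y = SparseVector(6, {0: 1.0})

r = sc.parallelize([LabeledPoint(0, x), LabeledPoint(1, y)])
m = LogisticRegressionWithSGD.train(r)
print(m.weights)

With consistent sizes, every sparse index stays inside the weight vector and the train call returns a LogisticRegressionModel instead of aborting the stage.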