PySpark does not work correctly with Jupyter

I installed PySpark 2.4.5 with Python 3.7 and Java 14.0.
In Jupyter everything works fine at first: reading the hamlet data and showing lines with .take(5) both succeed.

My Code:

def filter_hamlet_speaks(line):
    # keep records where one of the fields is exactly "HAMLET"
    if "HAMLET" in line:
        return True
    else:
        return False

hamlet_spoken_lines = split_hamlet.filter(lambda line: filter_hamlet_speaks(line))
hamlet_spoken_lines.take(5)

[['[email protected]', '', 'HAMLET'],
['[email protected]', 'HAMLET', 'son to the late, and nephew to the present king.'],
['[email protected]', '', 'HAMLET'],
['[email protected]', '', 'HAMLET'],
['[email protected]',
'HAMLET',
'[Aside] A little more than kin, and less than kind.']]
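
As an aside, the lambda wrapper is redundant here; filter accepts the function object directly and behaves the same:

hamlet_spoken_lines = split_hamlet.filter(filter_hamlet_speaks)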

Calling .count() or .collect() on the RDD, however, throws an error:

spoken_count = 0
spoken_count = hamlet_spoken_lines.count()

Any ideas?


Py4JJavaError Traceback (most recent call last)
in <module>
1 spoken_count = 0
----> 2 spoken_count = hamlet_spoken_lines.count()

~/opt/spark-2.4.5/python/pyspark/rdd.py in count(self)
1053 3
1054 """
-> 1055 return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
1056
1057 def stats(self):

~/opt/spark-2.4.5/python/pyspark/rdd.py in sum(self)
1044 6.0
1045 """
-> 1046 return self.mapPartitions(lambda x: [sum(x)]).fold(0, operator.add)
1047
1048 def count(self):

~/opt/spark-2.4.5/python/pyspark/rdd.py in fold(self, zeroValue, op)
915 # zeroValue provided to each partition is unique from the one provided
916 # to the final reduce call
--> 917 vals = self.mapPartitions(func).collect()
918 return reduce(op, vals, zeroValue)
919

~/opt/spark-2.4.5/python/pyspark/rdd.py in collect(self)
814 """
815 with SCCallSiteSync(self.context) as css:
--> 816 sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
817 return list(_load_from_socket(sock_info, self._jrdd_deserializer))
818

~/opt/spark-2.4.5/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in __call__(self, *args)
1255 answer = self.gateway_client.send_command(command)
1256 return_value = get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name)
1258
1259 for temp_arg in temp_args:

~/opt/spark-2.4.5/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: java.lang.IllegalArgumentException: Unsupported class file major version 58
at org.apache.xbean.asm6.ClassReader.<init>(ClassReader.java:166)
at org.apache.xbean.asm6.ClassReader.<init>(ClassReader.java:148)
at org.apache.xbean.asm6.ClassReader.<init>(ClassReader.java:136)
at org.apache.xbean.asm6.ClassReader.<init>(ClassReader.java:237)
at org.apache.spark.util.ClosureCleaner$.getClassReader(ClosureCleaner.scala:49)
at org.apache.spark.util.FieldAccessFinder$$anon$3$$anonfun$visitMethodInsn$2.apply(ClosureCleaner.scala:517)
at org.apache.spark.util.FieldAccessFinder$$anon$3$$anonfun$visitMethodInsn$2.apply(ClosureCleaner.scala:500)
at scala.collection.TraversableLike$WithFilter$$anonfun$foreach$1.apply(TraversableLike.scala:733)
at scala.collection.mutable.HashMap$$anon$1$$anonfun$foreach$2.apply(HashMap.scala:134)
at scala.collection.mutable.HashMap$$anon$1$$anonfun$foreach$2.apply(HashMap.scala:134)
at scala.collection.mutable.HashTable$class.foreachEntry(HashTable.scala:236)
at scala.collection.mutable.HashMap.foreachEntry(HashMap.scala:40)
at scala.collection.mutable.HashMap$$anon$1.foreach(HashMap.scala:134)
at scala.collection.TraversableLike$WithFilter.foreach(TraversableLike.scala:732)
at org.apache.spark.util.FieldAccessFinder$$anon$3.visitMethodInsn(ClosureCleaner.scala:500)
at org.apache.xbean.asm6.ClassReader.readCode(ClassReader.java:2175)
at org.apache.xbean.asm6.ClassReader.readMethod(ClassReader.java:1238)
at org.apache.xbean.asm6.ClassReader.accept(ClassReader.java:631)
at org.apache.xbean.asm6.ClassReader.accept(ClassReader.java:355)
at org.apache.spark.util.ClosureCleaner$$anonfun$org$apache$spark$util$ClosureCleaner$$clean$14.apply(ClosureCleaner.scala:307)
at org.apache.spark.util.ClosureCleaner$$anonfun$org$apache$spark$util$ClosureCleaner$$clean$14.apply(ClosureCleaner.scala:306)
at scala.collection.immutable.List.foreach(List.scala:392)
at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:306)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:162)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2326)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2100)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:990)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)
at org.apache.spark.rdd.RDD.collect(RDD.scala:989)
at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:166)
at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.base/java.lang.reflect.Method.invoke(Method.java:564)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.base/java.lang.Thread.run(Thread.java:832)

Solution:
Spark 2.4.x requires Java 8; newer Java versions are not supported. The telltale line in the traceback is java.lang.IllegalArgumentException: Unsupported class file major version 58: major version 58 corresponds to Java 14 class files, which the bytecode reader bundled with Spark 2.4.5 (xbean ASM 6) cannot parse. After installing JDK 1.8 and adding one of the following lines to ~/.bash_profile, everything works fine.

export JAVA_HOME=/Library/Java/JavaVirtualMachines/jdk1.8.0_251.jdk/Contents/Home
# or, equivalently, let macOS resolve the JDK 8 path automatically:
export JAVA_HOME="$(/usr/libexec/java_home -v 1.8)"
