How do I download the hamlet.txt
file used in the challenge?
I tried to run the challenge in a local Jupyter notebook, only to realize that I don’t have the file. I then noticed that what would be a simple FileNotFoundError
in plain Python is very hard to read in PySpark.
This is the DQ code I ran in local Jupyter without hamlet.txt
:
raw_hamlet = sc.textFile("hamlet.txt")  # `sc` is presumably the notebook's SparkContext; the relative path resolved to file:/C:/Users/hanqi/AIAP/hamlet.txt in the traceback below
split_hamlet = raw_hamlet.map(lambda line: line.split('\t'))  # split every line into its tab-separated fields
split_hamlet.take(5)  # first statement to fail: per the traceback, take() asks for the partitions, and that call reports the missing input path
def format_id(x):
    """Return a copy of the row with its first field reduced to the bare id.

    The first field is split on '@' and replaced by the piece that follows
    the first '@' (for example 'hamlet@0' becomes '0'). Every other field
    is copied through unchanged and in the same order.

    Args:
        x: Sequence of string fields for one line, i.e. the result of
            splitting that line on tabs.

    Returns:
        A new list holding the extracted id followed by the remaining
        fields of ``x``.

    Raises:
        IndexError: If ``x`` is empty or its first field contains no '@'.
    """
    ida = x[0].split('@')[1]
    # Slicing past the end yields an empty sequence, so a row with a single
    # field needs no separate length check.
    return [ida, *x[1:]]
# Rewrite the id field of every row, then preview the first ten results.
hamlet_with_ids = split_hamlet.map(format_id)
hamlet_with_ids.take(10)
The error is below.
I can see the arrow pointing to ----> 3 split_hamlet.take(5),
so it seems that the error message only points to action
steps rather than transformation
steps in Spark. I tested this by adding raw_hamlet.take()
right after the read, and indeed the error pointer moved up to that line.
Is my hypothesis that Spark errors only point to actions correct? If so, how do we narrow down the area to debug? (Sprinkling .take() calls
throughout the transformation steps doesn’t feel right.)
Py4JJavaError Traceback (most recent call last)
<ipython-input-25-4e5352d524d2> in <module>
1 raw_hamlet = sc.textFile("hamlet.txt")
2 split_hamlet = raw_hamlet.map(lambda line: line.split('\t'))
----> 3 split_hamlet.take(5)
4 def format_id(x):
5 ida = x[0].split('@')[1]
~\spark\spark-2.4.5-bin-hadoop2.7\python\pyspark\rdd.py in take(self, num)
1325 """
1326 items = []
-> 1327 totalParts = self.getNumPartitions()
1328 partsScanned = 0
1329
~\spark\spark-2.4.5-bin-hadoop2.7\python\pyspark\rdd.py in getNumPartitions(self)
2515
2516 def getNumPartitions(self):
-> 2517 return self._prev_jrdd.partitions().size()
2518
2519 @property
~\spark\spark-2.4.5-bin-hadoop2.7\python\lib\py4j-0.10.7-src.zip\py4j\java_gateway.py in __call__(self, *args)
1255 answer = self.gateway_client.send_command(command)
1256 return_value = get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name)
1258
1259 for temp_arg in temp_args:
~\spark\spark-2.4.5-bin-hadoop2.7\python\lib\py4j-0.10.7-src.zip\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling o283.partitions.
: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: file:/C:/Users/hanqi/AIAP/hamlet.txt
at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:287)
at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:229)
at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:315)
at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:204)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:273)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:269)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:269)
at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:273)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:269)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:269)
at org.apache.spark.api.java.JavaRDDLike$class.partitions(JavaRDDLike.scala:61)
at org.apache.spark.api.java.AbstractJavaRDDLike.partitions(JavaRDDLike.scala:45)
at sun.reflect.GeneratedMethodAccessor69.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
at java.lang.reflect.Method.invoke(Unknown Source)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Unknown Source)