Multi line JSON parser using Spark Dataframe
Input file
tags_sample.json:
-----------------------
{
"stackoverflow": [{
"tag": {
"id": 1,
"name": "scala",
"author": "Martin Odersky",
"frameworks": [
{
"id": 1,
"name": "Play Framework"
},
{
"id": 2,
"name": "Akka Framework"
}
]
}
},
{
"tag": {
"id": 2,
"name": "java",
"author": "James Gosling",
"frameworks": [
{
"id": 1,
"name": "Apache Tomcat"
},
{
"id": 2,
"name": "Spring Boot"
}
]
}
}
]
}
scala> import spark.sqlContext.implicits._
import spark.sqlContext.implicits._
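scala> val tagsDF = spark.read.option("multiLine", true).option("inferSchema", true).format("json").load("E:\\DataSets\\tags_sample.json")
tagsDF: org.apache.spark.sql.DataFrame = [stackoverflow: array<struct<tag:struct<author:string,frameworks:array<struct<id:bigint,name:string>>,id:bigint,name:string>>>]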
scala>
scala> tagsDF.printSchema
root
|-- stackoverflow: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- tag: struct (nullable = true)
| | | |-- author: string (nullable = true)
| | | |-- frameworks: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- id: long (nullable = true)
| | | | | |-- name: string (nullable = true)
| | | |-- id: long (nullable = true)
| | | |-- name: string (nullable = true)
scala> tagsDF.show // Not usable yet: the whole nested array is displayed as a single truncated cell
+--------------------+
| stackoverflow|
+--------------------+
|[[[Martin Odersky...|
+--------------------+
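scala> val df = tagsDF.select(explode($"stackoverflow") as "stags")
df: org.apache.spark.sql.DataFrame = [stags: struct<tag: struct<author: string, frameworks: array<struct<id:bigint,name:string>> ... 2 more fields>>]
scala> df.printSchema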
root
|-- stags: struct (nullable = true)
| |-- tag: struct (nullable = true)
| | |-- author: string (nullable = true)
| | |-- frameworks: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- id: long (nullable = true)
| | | | |-- name: string (nullable = true)
| | |-- id: long (nullable = true)
| | |-- name: string (nullable = true)
scala> val df1 = df.select(
| $"stags.tag.id" as "id",
| $"stags.tag.author" as "author",
| $"stags.tag.name" as "tag_name",
| $"stags.tag.frameworks.id" as "frameworks_id",
| $"stags.tag.frameworks.name" as "frameworks_name"
| )
df1: org.apache.spark.sql.DataFrame = [id: bigint, author: string ... 3 more fields]
scala> df1.show
+---+--------------+--------+-------------+--------------------+
| id| author|tag_name|frameworks_id| frameworks_name|
+---+--------------+--------+-------------+--------------------+
| 1|Martin Odersky| scala| [1, 2]|[Play Framework, ...|
| 2| James Gosling| java| [1, 2]|[Apache Tomcat, S...|
+---+--------------+--------+-------------+--------------------+
tags_sample.json:
-----------------------
{
"stackoverflow": [{
"tag": {
"id": 1,
"name": "scala",
"author": "Martin Odersky",
"frameworks": [
{
"id": 1,
"name": "Play Framework"
},
{
"id": 2,
"name": "Akka Framework"
}
]
}
},
{
"tag": {
"id": 2,
"name": "java",
"author": "James Gosling",
"frameworks": [
{
"id": 1,
"name": "Apache Tomcat"
},
{
"id": 2,
"name": "Spring Boot"
}
]
}
}
]
}
scala> import spark.sqlContext.implicits._
import spark.sqlContext.implicits._
scala> val tagsDF = spark.read.option("multiLine", true).option("inferSchema", true).format("json").load("E:\\DataSets\\tags_sample.json")
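tagsDF: org.apache.spark.sql.DataFrame = [stackoverflow: array<struct<tag:struct<author:string,frameworks:array<struct<id:bigint,name:string>>,id:bigint,name:string>>>]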
scala>
scala> tagsDF.printSchema
root
|-- stackoverflow: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- tag: struct (nullable = true)
| | | |-- author: string (nullable = true)
| | | |-- frameworks: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- id: long (nullable = true)
| | | | | |-- name: string (nullable = true)
| | | |-- id: long (nullable = true)
| | | |-- name: string (nullable = true)
scala> tagsDF.show // Not usable yet: the whole nested array is displayed as a single truncated cell
+--------------------+
| stackoverflow|
+--------------------+
|[[[Martin Odersky...|
+--------------------+
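scala> val df = tagsDF.select(explode($"stackoverflow") as "stags")
df: org.apache.spark.sql.DataFrame = [stags: struct<tag: struct<author: string, frameworks: array<struct<id:bigint,name:string>> ... 2 more fields>>]
scala> df.printSchema
root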
|-- stags: struct (nullable = true)
| |-- tag: struct (nullable = true)
| | |-- author: string (nullable = true)
| | |-- frameworks: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- id: long (nullable = true)
| | | | |-- name: string (nullable = true)
| | |-- id: long (nullable = true)
| | |-- name: string (nullable = true)
scala> val df1 = df.select(
| $"stags.tag.id" as "id",
| $"stags.tag.author" as "author",
| $"stags.tag.name" as "tag_name",
| $"stags.tag.frameworks.id" as "frameworks_id",
| $"stags.tag.frameworks.name" as "frameworks_name"
| )
df1: org.apache.spark.sql.DataFrame = [id: bigint, author: string ... 3 more fields]
scala> df1.show
+---+--------------+--------+-------------+--------------------+
| id| author|tag_name|frameworks_id| frameworks_name|
+---+--------------+--------+-------------+--------------------+
| 1|Martin Odersky| scala| [1, 2]|[Play Framework, ...|
| 2| James Gosling| java| [1, 2]|[Apache Tomcat, S...|
+---+--------------+--------+-------------+--------------------+
No comments:
Post a Comment