Space-to-Comma Transformation for DataFrame Data Using Spark with Scala
// Transforming spaces into commas in DataFrame data
scala> val obj = List( (1001, "7 2234 2342 2522"), (1002, "2222 2223 2224 2225"), (1003, "2000 2001 2002 2003 2004"), (1004, "2005 2006 2000 7 2001 2010"))
obj: List[(Int, String)] = List((1001,7 2234 2342 2522), (1002,2222 2223 2224 2225), (1003,2000 2001 2002 2003 2004), (1004,2005 2006 2000 7 2001 2010))
scala> val r1 = sc.parallelize(obj)
r1: org.apache.spark.rdd.RDD[(Int, String)] = ParallelCollectionRDD[1] at parallelize at <console>:26
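As a side note, parallelize also accepts an optional second argument controlling how many partitions the data is split into. For example:

val r1 = sc.parallelize(obj, 4)   // spread the four tuples across 4 partitions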
scala> r1.foreach(println)
(1001,7 2234 2342 2522)
(1002,2222 2223 2224 2225)
(1003,2000 2001 2002 2003 2004)
(1004,2005 2006 2000 7 2001 2010)
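One caveat with foreach: it runs on the executors, so neat, ordered output like the above is only guaranteed when the shell runs in local mode. On a real cluster the lines may land in executor logs, and in no particular order. Collecting the RDD first prints deterministically on the driver:

r1.collect().foreach(println)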
case class UserStory(userid: String, storyid: String)

val r2 = r1.map { x =>
  // x._1 is the Int userid, so convert it; x._2 is already a String
  val userid = x._1.toString
  val storyid = x._2
  UserStory(userid, storyid)
}
scala> r2.foreach(println)
UserStory(1001,7 2234 2342 2522)
UserStory(1002,2222 2223 2224 2225)
UserStory(1003,2000 2001 2002 2003 2004)
UserStory(1004,2005 2006 2000 7 2001 2010)
scala> val df = r2.toDF
df: org.apache.spark.sql.DataFrame = [userid: string, storyid: string]
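Note that converting an RDD to a DataFrame with toDF relies on implicits that spark-shell imports automatically. If you run the same code as a standalone application, add the import yourself from your SparkSession (assumed here to be named spark):

import spark.implicits._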
scala> df.printSchema
root
 |-- userid: string (nullable = true)
 |-- storyid: string (nullable = true)
scala> df.show
+------+--------------------+
|userid|             storyid|
+------+--------------------+
|  1001|    7 2234 2342 2522|
|  1002| 2222 2223 2224 2225|
|  1003|2000 2001 2002 20...|
|  1004|2005 2006 2000 7 ...|
+------+--------------------+
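show truncates any value longer than 20 characters, which is why the 1003 and 1004 rows end in "...". To see the full strings, disable truncation:

df.show(false)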
scala> import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions._
scala> df.select(col("userid") as "user id", regexp_replace(col("storyid"), " ", ",") as "story id").show
+-------+--------------------+
|user id|            story id|
+-------+--------------------+
|   1001|    7,2234,2342,2522|
|   1002| 2222,2223,2224,2225|
|   1003|2000,2001,2002,20...|
|   1004|2005,2006,2000,7,...|
+-------+--------------------+
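The RDD and case class steps above are mainly for illustration. A shorter route, sketched here under the assumption that you are in spark-shell (so the SparkSession implicits are already in scope), builds the DataFrame straight from the list and applies the same replacement:

import org.apache.spark.sql.functions._

// build the DataFrame directly from the list of tuples
// (userid stays an integer column this way)
val df2 = obj.toDF("userid", "storyid")

// replace every space in storyid with a comma
val result = df2.withColumn("storyid", regexp_replace(col("storyid"), " ", ","))

result.show(false)

And if the comma-separated string is only an intermediate step, split(col("storyid"), " ") gives an array column instead, which is usually easier to work with than further string manipulation.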