Saturday, March 30, 2019

Space into Comma Transformation for Dataframe data using Spark with Scala

Space into Comma Transformation for Dataframe data using Spark with Scala



// Replace every space with a comma in a string column of a DataFrame
scala> val obj = List( (1001, "7 2234 2342 2522"), (1002, "2222 2223 2224 2225"), (1003, "2000 2001 2002 2003 2004"), (1004, "2005 2006 2000 7 2001 2010"))
obj: List[(Int, String)] = List((1001,7 2234 2342 2522), (1002,2222 2223 2224 2225), (1003,2000 2001 2002 2003 2004), (1004,2005 2006 2000 7 2001 2010))


scala> val r1 = sc.parallelize(obj);
r1: org.apache.spark.rdd.RDD[(Int, String)] = ParallelCollectionRDD[1] at parallelize at :26

scala>  r1.foreach(println)
(1001,7 2234 2342 2522)
(1002,2222 2223 2224 2225)
(1003,2000 2001 2002 2003 2004)
(1004,2005 2006 2000 7 2001 2010)


 case class UserStory(userid:String, storyid:String)

 val r2 = r1.map (x => {
       val userid = x._1.toString
       val storyid = x._2.toString
       UserStory(userid,storyid)
       })
 
scala> r2.foreach(println)
UserStory(1001,7 2234 2342 2522)
UserStory(1002,2222 2223 2224 2225)
UserStory(1003,2000 2001 2002 2003 2004)
UserStory(1004,2005 2006 2000 7 2001 2010)


scala> val df = r2.toDF
df: org.apache.spark.sql.DataFrame = [userid: string, storyid: string]

scala> df.printSchema
root
 |-- userid: string (nullable = true)
 |-- storyid: string (nullable = true)


scala> df.show
+------+--------------------+
|userid|             storyid|
+------+--------------------+
|  1001|    7 2234 2342 2522|
|  1002| 2222 2223 2224 2225|
|  1003|2000 2001 2002 20...|
|  1004|2005 2006 2000 7 ...|
+------+--------------------+

scala> import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions._

// regexp_replace treats its second argument as a regular expression; here every
// single space in `storyid` is replaced by a comma. The `as` aliases rename the
// two output columns to "user id" and "story id". `show` truncates long cells,
// which is why the longer rows in the output below end in "...".
scala> df.select(col("userid") as "user id",regexp_replace(col("storyid")," ",",") as "story id").show
+-------+--------------------+
|user id|            story id|
+-------+--------------------+
|   1001|    7,2234,2342,2522|
|   1002| 2222,2223,2224,2225|
|   1003|2000,2001,2002,20...|
|   1004|2005,2006,2000,7,...|
+-------+--------------------+

No comments:

Post a Comment

Python Challenges Program

Challenges program: program 1: #Input :ABAABBCA #Output: A4B3C1 str1="ABAABBCA" str2="" d={} for x in str1: d[x]=d...