Hive Queries Vs Dataframe Queries - Part 1
hadoop@hadoop:~/Desktop/vow$ touch emp.txt
hadoop@hadoop:~/Desktop/vow$ atom emp.txt
101,Sathya,1000
102,Shanthi,2000
103,Mani,3000
104,Kalai,4000
105,Aruvi,5000
106,Nila,1500
107,Praveen,2500
108,Rashee,7500
109,Pinki,3500
110,Ravi,2500
pwd : /home/hadoop/Desktop/vow
hive> create database learning;
OK
Time taken: 0.901 seconds
hive> use learning;
OK
Time taken: 0.08 seconds
hive> create external table emp(id int, name varchar(50), salary int) row format delimited fields terminated by ',';
hive> load data local inpath '/home/hadoop/Desktop/vow/emp.txt' into table emp;
hive> select * from emp;
OK
101 Sathya 1000
102 Shanthi 2000
103 Mani 3000
104 Kalai 4000
105 Aruvi 5000
106 Nila 1500
107 Praveen 2500
108 Rashee 7500
109 Pinki 3500
110 Ravi 2500
Time taken: 0.305 seconds, Fetched: 10 row(s)
scala> import org.apache.spark.sql.types._
import org.apache.spark.sql.types._
scala> val empSchema = StructType(StructField("id",IntegerType,true)::StructField("name",StringType,true)::StructField("salary",IntegerType,true)::Nil)
empSchema: org.apache.spark.sql.types.StructType = StructType(StructField(id,IntegerType,true), StructField(name,StringType,true), StructField(salary,IntegerType,true))
scala> val df = spark.read.format("csv").option("header","false").schema(empSchema).load("/home/hadoop/Desktop/vow/emp.txt");
df: org.apache.spark.sql.DataFrame = [id: int, name: string ... 1 more field]
scala> df.printSchema
root
|-- id: integer (nullable = true)
|-- name: string (nullable = true)
|-- salary: integer (nullable = true)
hive> select * from emp;
OK
101 Sathya 1000
102 Shanthi 2000
103 Mani 3000
104 Kalai 4000
105 Aruvi 5000
106 Nila 1500
107 Praveen 2500
108 Rashee 7500
109 Pinki 3500
110 Ravi 2500
scala> df.show
+---+-------+------+
| id| name|salary|
+---+-------+------+
|101| Sathya| 1000|
|102|Shanthi| 2000|
|103| Mani| 3000|
|104| Kalai| 4000|
|105| Aruvi| 5000|
|106| Nila| 1500|
|107|Praveen| 2500|
|108| Rashee| 7500|
|109| Pinki| 3500|
|110| Ravi| 2500|
+---+-------+------+
hive> select max(salary) from emp;
7500
scala> import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions._
scala> df.select(max(df("salary")) as "Salary").show
or
scala> df.select(max($"salary") as "Salary").show
+------+
|Salary|
+------+
| 7500|
+------+
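An equivalent form (a sketch, not from the original session) skips select and applies the aggregate directly with agg:

df.agg(max($"salary") as "Salary").show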
hive> select max(salary),min(salary) from emp;
OK
7500 1000
scala> df.select(max(df("salary")) as "MaxSal", min(df("salary")) as "MinSal").show
df.select(max($"salary") as "MaxSal",min($"salary") as "MinSal").show
+------+------+
|MaxSal|MinSal|
+------+------+
| 7500| 1000|
+------+------+
hive> select salary from emp order by salary;
OK
1000
1500
2000
2500
2500
3000
3500
4000
5000
7500
scala> df.select(df("salary")).orderBy("salary").show
df.select($"salary").orderBy($"salary").show
+------+
|salary|
+------+
| 1000|
| 1500|
| 2000|
| 2500|
| 2500|
| 3000|
| 3500|
| 4000|
| 5000|
| 7500|
+------+
hive> select salary from emp order by salary desc;
OK
7500
5000
4000
3500
3000
2500
2500
2000
1500
1000
import org.apache.spark.sql.functions._
scala> df.select(df("salary")).orderBy(desc("salary")).show
or
scala> df.select($"salary").orderBy($"salary".desc).show
+------+
|salary|
+------+
| 7500|
| 5000|
| 4000|
| 3500|
| 3000|
| 2500|
| 2500|
| 2000|
| 1500|
| 1000|
+------+
hive> select sum(salary) from emp;
OK
32500
scala> df.select(sum("salary") as "Sum").show
or
scala> df.select(sum($"salary") as "Sum").show
+-----+
| Sum|
+-----+
|32500|
+-----+
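Since this part contrasts Hive queries with DataFrame queries, a third route is worth noting: register the DataFrame as a temporary view and run the same HiveQL-style statements through Spark SQL. A minimal sketch (the view name empView is an assumption, not part of the original session):

df.createOrReplaceTempView("empView")
spark.sql("select max(salary) as MaxSal, min(salary) as MinSal from empView").show
spark.sql("select salary from empView order by salary desc").show
spark.sql("select sum(salary) as Sum from empView").show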
hadoop@hadoop:~$ touch emp.txt
hadoop@hadoop:~$ atom emp.txt
id,name,gender,salary,deptid
100,Ravi,m,1000,10
101,Rani,f,2000,11
102,Suresh,m,3000,12
103,Rahul,m,1250,10
104,Rashee,f,3500,11
105,Priya,f,3600,12
106,Ayeesha,f,4000,10
107,Aruvi,f,2500,11
108,Arushi,f,2800,12
109,Vinay,m,3200,10
110,Kalai,f,2550,11
111,Shilpa,f,2600,12
hadoop@hadoop:~$ atom dept.txt
deptid,deptname
10,Marketing
11,Sales
12,Production
// tblproperties("skip.header.line.count"="1") --> skips the header line of the data file
hive> create external table emp(id int, name varchar(50),gender char(1), salary int, deptid int) row format delimited fields terminated by ',' tblproperties("skip.header.line.count"="1");
hive> load data local inpath "/home/hadoop/Desktop/vow/emp.txt" into table emp;
hive> create external table dept(deptid int, deptname varchar(50)) row format delimited fields terminated by ',' tblproperties("skip.header.line.count"="1");
hive> load data local inpath "/home/hadoop/Desktop/vow/dept.txt" into table dept;
scala> val empSchema = StructType(StructField("id",IntegerType,true)::StructField("name",StringType,true)::StructField("gender",StringType,true)::StructField("salary",IntegerType,true)::StructField("deptid",IntegerType,true)::Nil)
empSchema: org.apache.spark.sql.types.StructType = StructType(StructField(id,IntegerType,true), StructField(name,StringType,true), StructField(gender,StringType,true), StructField(salary,IntegerType,true), StructField(deptid,IntegerType,true))
scala> val deptSchema = StructType(StructField("deptid",IntegerType,true)::StructField("deptname",StringType,true)::Nil)
deptSchema: org.apache.spark.sql.types.StructType = StructType(StructField(deptid,IntegerType,true), StructField(deptname,StringType,true))
val dfEmp = spark.read.format("csv").option("header","true").schema(empSchema).load("/home/hadoop/Desktop/vow/emp.txt");
val dfDept = spark.read.format("csv").option("header","true").schema(deptSchema).load("/home/hadoop/Desktop/vow/dept.txt");
hive> select * from emp;
OK
100 Ravi m 1000 10
101 Rani f 2000 11
102 Suresh m 3000 12
103 Rahul m 1250 10
104 Rashee f 3500 11
105 Priya f 3600 12
106 Ayeesha f 4000 10
107 Aruvi f 2500 11
108 Arushi f 2800 12
109 Vinay m 3200 10
110 Kalai f 2550 11
111 Shilpa f 2600 12
scala> dfEmp.show
or
scala> dfEmp.select("*").show
+---+-------+------+------+------+
| id| name|gender|salary|deptid|
+---+-------+------+------+------+
|100| Ravi| m| 1000| 10|
|101| Rani| f| 2000| 11|
|102| Suresh| m| 3000| 12|
|103| Rahul| m| 1250| 10|
|104| Rashee| f| 3500| 11|
|105| Priya| f| 3600| 12|
|106|Ayeesha| f| 4000| 10|
|107| Aruvi| f| 2500| 11|
|108| Arushi| f| 2800| 12|
|109| Vinay| m| 3200| 10|
|110| Kalai| f| 2550| 11|
|111| Shilpa| f| 2600| 12|
+---+-------+------+------+------+
hive> select * from dept;
OK
10 Marketing
11 Sales
12 Production
Time taken: 0.238 seconds, Fetched: 3 row(s)
scala> dfDept.show
or
scala> dfDept.select("*").show
+------+----------+
|deptid| deptname|
+------+----------+
| 10| Marketing|
| 11| Sales|
| 12|Production|
+------+----------+
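dfDept is loaded above but never joined in this session. As a sketch of how the two DataFrames line up on deptid (mirroring a Hive join such as select e.name, e.salary, d.deptname from emp e join dept d on e.deptid = d.deptid):

dfEmp.join(dfDept, "deptid").select("name", "salary", "deptname").show
// per-department aggregate with readable department names
dfEmp.join(dfDept, "deptid").groupBy("deptname").agg(max("salary") as "maxSal").show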
scala> dfEmp.select(max($"salary") as "MaxSal").show
+------+
|MaxSal|
+------+
| 4000|
+------+
hive> select max(salary) from emp;
4000
scala> dfEmp.select(min($"salary") as "MinSal").show
+------+
|MinSal|
+------+
| 1000|
+------+
hive> select min(salary) from emp;
1000
hive> select max(salary) as MaxSal, min(salary) as MinSal from emp;
4000 1000
scala> dfEmp.select(max("salary") as "MaxSal",min("salary") as "MinSal").show
+------+------+
|MaxSal|MinSal|
+------+------+
| 4000| 1000|
+------+------+
hive> select deptid,max(salary) from emp group by deptid order by deptid;
10 4000
11 3500
12 3600
scala> dfEmp.groupBy("deptid").agg(max("salary") as "maxSal").orderBy("deptid").show
+------+------+
|deptid|maxSal|
+------+------+
| 10| 4000|
| 11| 3500|
| 12| 3600|
+------+------+
hive> select deptid,count(name) from emp group by deptid order by deptid;
10 4
11 4
12 4
scala> dfEmp.groupBy("deptid").agg(count("name") as "nameCount").orderBy("deptid").show
+------+---------+
|deptid|nameCount|
+------+---------+
| 10| 4|
| 11| 4|
| 12| 4|
+------+---------+
scala> dfEmp.select($"salary").orderBy("salary").show
or
scala> dfEmp.select(dfEmp("salary")).orderBy("salary").show
+------+
|salary|
+------+
| 1000|
| 1250|
| 2000|
| 2500|
| 2550|
| 2600|
| 2800|
| 3000|
| 3200|
| 3500|
| 3600|
| 4000|
+------+
hive> select salary from emp order by salary;
1000
1250
2000
2500
2550
2600
2800
3000
3200
3500
3600
4000
scala> dfEmp.select($"salary").orderBy(desc("salary")).show
or
dfEmp.select("salary").orderBy(desc("salary")).show
+------+
|salary|
+------+
| 4000|
| 3600|
| 3500|
| 3200|
| 3000|
| 2800|
| 2600|
| 2550|
| 2500|
| 2000|
| 1250|
| 1000|
+------+
hive> select salary from emp order by salary desc;
4000
3600
3500
3200
3000
2800
2600
2550
2500
2000
1250
1000
hive> select gender,max(salary) from emp group by gender order by gender;
f 4000
m 3200
scala> dfEmp.groupBy("gender").agg(max("salary") as "maxSal").orderBy("gender").show
+------+------+
|gender|maxSal|
+------+------+
| f| 4000|
| m| 3200|
+------+------+
hive> select gender,sum(salary) from emp group by gender order by gender;
f 23550
m 8450
scala> dfEmp.groupBy("gender").agg(sum("salary") as "GenderSumSal").orderBy("gender").show
+------+------------+
|gender|GenderSumSal|
+------+------------+
| f| 23550|
| m| 8450|
+------+------------+
hive> select * from emp order by salary desc;
OK
106 Ayeesha f 4000 10
105 Priya f 3600 12
104 Rashee f 3500 11
109 Vinay m 3200 10
102 Suresh m 3000 12
108 Arushi f 2800 12
111 Shilpa f 2600 12
110 Kalai f 2550 11
107 Aruvi f 2500 11
101 Rani f 2000 11
103 Rahul m 1250 10
100 Ravi m 1000 10
scala> dfEmp.orderBy(desc("salary")).show
+---+-------+------+------+------+
| id| name|gender|salary|deptid|
+---+-------+------+------+------+
|106|Ayeesha| f| 4000| 10|
|105| Priya| f| 3600| 12|
|104| Rashee| f| 3500| 11|
|109| Vinay| m| 3200| 10|
|102| Suresh| m| 3000| 12|
|108| Arushi| f| 2800| 12|
|111| Shilpa| f| 2600| 12|
|110| Kalai| f| 2550| 11|
|107| Aruvi| f| 2500| 11|
|101| Rani| f| 2000| 11|
|103| Rahul| m| 1250| 10|
|100| Ravi| m| 1000| 10|
+---+-------+------+------+------+
hive> select * from emp order by salary desc limit 2;
OK
106 Ayeesha f 4000 10
105 Priya f 3600 12
scala> dfEmp.orderBy(desc("salary")).show(2);
+---+-------+------+------+------+
| id| name|gender|salary|deptid|
+---+-------+------+------+------+
|106|Ayeesha| f| 4000| 10|
|105| Priya| f| 3600| 12|
+---+-------+------+------+------+
only showing top 2 rows
// top salaried person
hive> select * from emp order by salary desc limit 1;
106 Ayeesha f 4000 10
scala> dfEmp.orderBy(desc("salary")).show(1);
+---+-------+------+------+------+
| id| name|gender|salary|deptid|
+---+-------+------+------+------+
|106|Ayeesha| f| 4000| 10|
+---+-------+------+------+------+
only showing top 1 row
//extract single value (scalar) from dataframe
scala> val x:Int = dfEmp.agg(max("salary")).head().getInt(0)
x: Int = 4000
scala> dfEmp.orderBy(desc("salary")).show
+---+-------+------+------+------+
| id| name|gender|salary|deptid|
+---+-------+------+------+------+
|106|Ayeesha| f| 4000| 10|
|105| Priya| f| 3600| 12|
|104| Rashee| f| 3500| 11|
|109| Vinay| m| 3200| 10|
|102| Suresh| m| 3000| 12|
|108| Arushi| f| 2800| 12|
|111| Shilpa| f| 2600| 12|
|110| Kalai| f| 2550| 11|
|107| Aruvi| f| 2500| 11|
|101| Rani| f| 2000| 11|
|103| Rahul| m| 1250| 10|
|100| Ravi| m| 1000| 10|
+---+-------+------+------+------+
scala> dfEmp.where($"salary" < dfEmp.agg(max("salary")).first().getInt(0)).orderBy(desc("salary")).show(1)
+---+-----+------+------+------+
| id| name|gender|salary|deptid|
+---+-----+------+------+------+
|105|Priya| f| 3600| 12|
+---+-----+------+------+------+
only showing top 1 row
// 2nd maximum salaried person
hive> select * from emp where salary not in (select max(salary) from emp ) order by salary desc limit 1;
105 Priya f 3600 12
hive> select * from (select * from emp sort by salary desc limit 2) result sort by salary limit 1;
105 Priya f 3600 12
scala> dfEmp.orderBy(desc("Salary")).limit(2).orderBy("salary").show(1);
+---+-----+------+------+------+
| id| name|gender|salary|deptid|
+---+-----+------+------+------+
|105|Priya| f| 3600| 12|
+---+-----+------+------+------+
only showing top 1 row
scala> dfEmp.orderBy(desc("Salary")).take(2)
res87: Array[org.apache.spark.sql.Row] = Array([106,Ayeesha,f,4000,10], [105,Priya,f,3600,12])
scala> dfEmp.orderBy(desc("Salary")).take(2)(1);
res91: org.apache.spark.sql.Row = [105,Priya,f,3600,12]
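A window function is another way to get the Nth-highest salary, and unlike the limit/take approach it makes ties explicit. A sketch assuming the same dfEmp (dense_rank and Window are standard Spark SQL APIs):

import org.apache.spark.sql.expressions.Window

// no partitionBy: the whole DataFrame is one window (fine for small data,
// but Spark will warn that everything moves to a single partition)
val bySalaryDesc = Window.orderBy(desc("salary"))
dfEmp.withColumn("rnk", dense_rank().over(bySalaryDesc))
     .where($"rnk" === 2)
     .show
// Hive equivalent:
// select * from (select *, dense_rank() over (order by salary desc) rnk from emp) t where rnk = 2;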
Experimenting with ORC file - Parsing Yahoo Stocks using Spark with Scala
Input
yahoo_stocks.csv:
------------------
Date,Open,High,Low,Close,Volume,Adj Close
2015-04-28,44.34,44.57,43.94,44.34,7188300,44.34
2015-04-27,44.65,45.10,44.25,44.36,10840900,44.36
2015-04-24,43.73,44.71,43.69,44.52,11267500,44.52
2015-04-23,43.92,44.06,43.58,43.70,14274900,43.70
2015-04-22,44.58,44.85,43.67,43.98,32241200,43.98
2015-04-21,45.15,45.18,44.45,44.49,16103700,44.49
2015-04-20,44.73,44.91,44.41,44.66,10052900,44.66
2015-04-17,45.30,45.44,44.25,44.45,13305700,44.45
2015-04-16,45.82,46.13,45.53,45.78,13800300,45.78
2015-04-15,45.46,45.83,45.23,45.73,15033500,45.73
hadoop@hadoop:~/Desktop/vow$ hdfs dfs -copyFromLocal yahoo_stocks.csv /user/
hadoop@hadoop:~/Desktop/vow$ hdfs dfs -head /user/yahoo_stocks.csv
Date,Open,High,Low,Close,Volume,Adj Close
2015-04-28,44.34,44.57,43.94,44.34,7188300,44.34
2015-04-27,44.65,45.10,44.25,44.36,10840900,44.36
2015-04-24,43.73,44.71,43.69,44.52,11267500,44.52
2015-04-23,43.92,44.06,43.58,43.70,14274900,43.70
2015-04-22,44.58,44.85,43.67,43.98,32241200,43.98
2015-04-21,45.15,45.18,44.45,44.49,16103700,44.49
scala> import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.SQLContext
scala> import org.apache.spark.sql.types._
import org.apache.spark.sql.types._
scala> import org.apache.spark.sql._
import org.apache.spark.sql._
scala> import spark.implicits._
import spark.implicits._
scala> spark.sql("CREATE TABLE yahoo_orc_table (date STRING, open_price FLOAT, high_price FLOAT, low_price FLOAT, close_price FLOAT, volume INT, adj_price FLOAT) stored as orc")
2019-02-01 20:59:04 WARN HiveMetaStore:1383 - Location: hdfs://localhost:9000/user/hive/warehouse/yahoo_orc_table specified for non-external table:yahoo_orc_table
res39: org.apache.spark.sql.DataFrame = []
scala> val stocks = sc.textFile("hdfs://localhost:9000/user/yahoo_stocks.csv")
stocks: org.apache.spark.rdd.RDD[String] = hdfs://localhost:9000/user/yahoo_stocks.csv MapPartitionsRDD[109] at textFile at <console>:42
scala> stocks.take(5).foreach(println)
Date,Open,High,Low,Close,Volume,Adj Close
2015-04-28,44.34,44.57,43.94,44.34,7188300,44.34
2015-04-27,44.65,45.10,44.25,44.36,10840900,44.36
2015-04-24,43.73,44.71,43.69,44.52,11267500,44.52
2015-04-23,43.92,44.06,43.58,43.70,14274900,43.70
scala> val header = stocks.first
header: String = Date,Open,High,Low,Close,Volume,Adj Close
scala> val rddStocksWithoutHeader = stocks.filter(x => x != header)
rddStocksWithoutHeader: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[110] at filter at <console>:45
scala> rddStocksWithoutHeader.take(5).foreach(println)
2015-04-28,44.34,44.57,43.94,44.34,7188300,44.34
2015-04-27,44.65,45.10,44.25,44.36,10840900,44.36
2015-04-24,43.73,44.71,43.69,44.52,11267500,44.52
2015-04-23,43.92,44.06,43.58,43.70,14274900,43.70
2015-04-22,44.58,44.85,43.67,43.98,32241200,43.98
scala> case class YahooStockPrice (date:String, open:Float, high:Float,low:Float,close:Float,volume:Integer,adjcClose:Float)
defined class YahooStockPrice
scala> val stockpricewithSchema = rddStocksWithoutHeader.map (x => {
| val fields = x.split(",")
| val date = fields(0)
| val open = fields(1).trim.toFloat
| val high = fields(2).trim.toFloat
| val low = fields(3).trim.toFloat
| val close = fields(4).trim.toFloat
| val volume = fields(5).trim.toInt
| val adjClose = fields(6).trim.toFloat
| YahooStockPrice(date,open,high,low,close,volume,adjClose)
| })
stockpricewithSchema: org.apache.spark.rdd.RDD[YahooStockPrice] = MapPartitionsRDD[111] at map at <console>:45
scala> stockpricewithSchema.take(5).foreach(println)
YahooStockPrice(2015-04-28,44.34,44.57,43.94,44.34,7188300,44.34)
YahooStockPrice(2015-04-27,44.65,45.1,44.25,44.36,10840900,44.36)
YahooStockPrice(2015-04-24,43.73,44.71,43.69,44.52,11267500,44.52)
YahooStockPrice(2015-04-23,43.92,44.06,43.58,43.7,14274900,43.7)
YahooStockPrice(2015-04-22,44.58,44.85,43.67,43.98,32241200,43.98)
scala> val dfStockPrice = stockpricewithSchema.toDF
dfStockPrice: org.apache.spark.sql.DataFrame = [date: string, open: float ... 5 more fields]
scala> dfStockPrice.printSchema
root
|-- date: string (nullable = true)
|-- open: float (nullable = false)
|-- high: float (nullable = false)
|-- low: float (nullable = false)
|-- close: float (nullable = false)
|-- volume: integer (nullable = true)
|-- adjcClose: float (nullable = false)
scala> dfStockPrice.show(5)
+----------+-----+-----+-----+-----+--------+---------+
| date| open| high| low|close| volume|adjcClose|
+----------+-----+-----+-----+-----+--------+---------+
|2015-04-28|44.34|44.57|43.94|44.34| 7188300| 44.34|
|2015-04-27|44.65| 45.1|44.25|44.36|10840900| 44.36|
|2015-04-24|43.73|44.71|43.69|44.52|11267500| 44.52|
|2015-04-23|43.92|44.06|43.58| 43.7|14274900| 43.7|
|2015-04-22|44.58|44.85|43.67|43.98|32241200| 43.98|
+----------+-----+-----+-----+-----+--------+---------+
only showing top 5 rows
scala> dfStockPrice.createOrReplaceTempView("ystemp")
scala> val results = spark.sql("SELECT * FROM ystemp limit 10")
results: org.apache.spark.sql.DataFrame = [date: string, open: float ... 5 more fields]
scala> results.show
+----------+-----+-----+-----+-----+--------+---------+
| date| open| high| low|close| volume|adjcClose|
+----------+-----+-----+-----+-----+--------+---------+
|2015-04-28|44.34|44.57|43.94|44.34| 7188300| 44.34|
|2015-04-27|44.65| 45.1|44.25|44.36|10840900| 44.36|
|2015-04-24|43.73|44.71|43.69|44.52|11267500| 44.52|
|2015-04-23|43.92|44.06|43.58| 43.7|14274900| 43.7|
|2015-04-22|44.58|44.85|43.67|43.98|32241200| 43.98|
|2015-04-21|45.15|45.18|44.45|44.49|16103700| 44.49|
|2015-04-20|44.73|44.91|44.41|44.66|10052900| 44.66|
|2015-04-17| 45.3|45.44|44.25|44.45|13305700| 44.45|
|2015-04-16|45.82|46.13|45.53|45.78|13800300| 45.78|
|2015-04-15|45.46|45.83|45.23|45.73|15033500| 45.73|
+----------+-----+-----+-----+-----+--------+---------+
scala> results.map(t => "Stock Entry: " + t.toString).collect().foreach(println)
Stock Entry: [2015-04-28,44.34,44.57,43.94,44.34,7188300,44.34]
Stock Entry: [2015-04-27,44.65,45.1,44.25,44.36,10840900,44.36]
Stock Entry: [2015-04-24,43.73,44.71,43.69,44.52,11267500,44.52]
Stock Entry: [2015-04-23,43.92,44.06,43.58,43.7,14274900,43.7]
Stock Entry: [2015-04-22,44.58,44.85,43.67,43.98,32241200,43.98]
Stock Entry: [2015-04-21,45.15,45.18,44.45,44.49,16103700,44.49]
Stock Entry: [2015-04-20,44.73,44.91,44.41,44.66,10052900,44.66]
Stock Entry: [2015-04-17,45.3,45.44,44.25,44.45,13305700,44.45]
Stock Entry: [2015-04-16,45.82,46.13,45.53,45.78,13800300,45.78]
Stock Entry: [2015-04-15,45.46,45.83,45.23,45.73,15033500,45.73]
scala> val results = spark.sql("SELECT * FROM ystemp")
results: org.apache.spark.sql.DataFrame = [date: string, open: float ... 5 more fields]
scala> results.write.format("orc").save("yahoo_stocks_orc")
scala> val yahoo_stocks_orc = spark.read.format("orc").load("yahoo_stocks_orc")
yahoo_stocks_orc: org.apache.spark.sql.DataFrame = [date: string, open: float ... 5 more fields]
scala> yahoo_stocks_orc.createOrReplaceTempView("orcTest")
scala> spark.sql("SELECT * from orcTest").collect.take(3).foreach(println)
[2015-04-28,44.34,44.57,43.94,44.34,7188300,44.34]
[2015-04-27,44.65,45.1,44.25,44.36,10840900,44.36]
[2015-04-24,43.73,44.71,43.69,44.52,11267500,44.52]
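The yahoo_orc_table created at the start of this exercise was never actually populated above. A sketch of filling it from the temp view (insertInto matches columns by position, so the 7 columns of ystemp must line up with the table definition):

// append the query result into the Hive-managed ORC table
spark.sql("SELECT * FROM ystemp")
     .write.mode("append")
     .insertInto("yahoo_orc_table")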
Hive and Spark Integration
// Start Hive, create a new database (School), create a new table (Student), and add 3 records
hive> show databases;
OK
default
Time taken: 0.832 seconds, Fetched: 1 row(s)
hive> create database School ;
OK
Time taken: 0.343 seconds
hive> use School;
OK
Time taken: 0.045 seconds
hive> create table Student(id int, name varchar(50));
OK
Time taken: 0.685 seconds
hive> insert into Student (id,name) values(101,'Sankar');
hive> insert into Student (id,name) values(102,'Zee');
hive> insert into Student (id,name) values(103,'Maha');
hive> select * from Student;
OK
101 Sankar
102 Zee
103 Maha
Time taken: 0.261 seconds, Fetched: 3 row(s)
// Start Spark and do the following to access Hive Database (School), and Hive Table (Student)
scala> spark.sql("use School")
scala> spark.sql("select * from Student").show()
+---+------+
| id| name|
+---+------+
|101|Sankar|
|102| Zee|
|103| Maha|
+---+------+
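For spark.sql to see Hive databases like School, the SparkSession must be built with Hive support; spark-shell usually arranges this automatically when hive-site.xml is on the classpath. A minimal sketch for a standalone application (the app name is an assumption):

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("HiveIntegration")
  .enableHiveSupport()   // needs hive-site.xml / metastore access on the classpath
  .getOrCreate()

spark.sql("use School")
spark.sql("select * from Student").show()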
How to fix metastore issue in Hive and MySQL?
$ sudo mysql
mysql> drop database metastore;
$ schematool -dbType mysql -initSchema
$ schematool -dbType mysql -info
Spark RDD to DataFrame to Hive Table (Spark SQL with Hive integration)
// Hide log and warning messages in spark-shell
import org.apache.log4j.Logger
import org.apache.log4j.Level
Logger.getLogger("org").setLevel(Level.OFF)
Logger.getLogger("akka").setLevel(Level.OFF)
Input file
weblog.txt:
-------------
3.94.78.5 - 69827 [15/Sep/2013:23:58:36 +0100] "GET /KBDOC-00033.html HTTP/1.0"
19.33.140.62 - 21475 [15/Sep/2013:23:58:34 +0100] "GET /KBDOC-00033.html HTTP/1.0"
19.31.140.62 - 2489 [15/Sep/2013:23:58:34 +0100] "GET /KBDOC-00033.html HTTP/1.0"
3.91.78.5 - 69827 [15/Sep/2013:23:58:36 +0100] "GET /KBDOC-00033.html HTTP/1.0"
19.32.140.62 - 2489 [15/Sep/2013:23:58:34 +0100] "GET /KBDOC-00033.html HTTP/1.0"
19.35.140.62 - 2489 [15/Sep/2013:23:58:34 +0100] "GET /KBDOC-00033.html HTTP/1.0"
3.93.78.5 - 69827 [15/Sep/2013:23:58:36 +0100] "GET /KBDOC-00033.html HTTP/1.0"
12.38.140.62 - 2489 [15/Sep/2013:23:58:34 +0100] "GET /KBDOC-00033.html HTTP/1.0"
12.38.140.62 - 4712 [15/Sep/2013:23:58:34 +0100] "GET /KBDOC-00033.html HTTP/1.0"
13.94.78.5 - 69827 [15/Sep/2013:23:58:36 +0100] "GET /KBDOC-00033.html HTTP/1.0"
11.38.140.62 - 4712 [15/Sep/2013:23:58:34 +0100] "GET /KBDOC-00033.html HTTP/1.0"
12.38.140.62 - 4712 [15/Sep/2013:23:58:34 +0100] "GET /KBDOC-00033.html HTTP/1.0"
scala> val rdd1 = sc.textFile("/home/hadoop/Desktop/weblog.txt")
rdd1: org.apache.spark.rdd.RDD[String] = /home/hadoop/Desktop/weblog.txt MapPartitionsRDD[50] at textFile at <console>:29
scala> rdd1.count
res15: Long = 12
scala> rdd1.take(5).foreach(println)
3.94.78.5 - 69827 [15/Sep/2013:23:58:36 +0100] "GET /KBDOC-00033.html HTTP/1.0"
19.33.140.62 - 21475 [15/Sep/2013:23:58:34 +0100] "GET /KBDOC-00033.html HTTP/1.0"
19.31.140.62 - 2489 [15/Sep/2013:23:58:34 +0100] "GET /KBDOC-00033.html HTTP/1.0"
3.91.78.5 - 69827 [15/Sep/2013:23:58:36 +0100] "GET /KBDOC-00033.html HTTP/1.0"
19.32.140.62 - 2489 [15/Sep/2013:23:58:34 +0100] "GET /KBDOC-00033.html HTTP/1.0"
scala> val ip_Pattern = "[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}".r
ip_Pattern: scala.util.matching.Regex = [0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}
scala> val port_Pattern = " - [0-9]{1,5}".r
port_Pattern: scala.util.matching.Regex =  - [0-9]{1,5}
scala> val time_Pattern = "[0-9]{1,2}/[A-Z][a-z]{2}/[0-9]{4}:[0-9]{2}:[0-9]{2}:[0-9]{2} [+][0-9]{4}".r
time_Pattern: scala.util.matching.Regex = [0-9]{1,2}/[A-Z][a-z]{2}/[0-9]{4}:[0-9]{2}:[0-9]{2}:[0-9]{2} [+][0-9]{4}
scala> val rdd2 = rdd1.map { x =>
val ip = ip_Pattern.findFirstIn(x).get
val port = port_Pattern.findFirstIn(x).get
val port1 = port.slice(2,port.length).trim().toInt
val time = time_Pattern.findFirstIn(x).get
(ip,port1,time)
}
rdd2: org.apache.spark.rdd.RDD[(String, Int, String)] = MapPartitionsRDD[2] at map at <console>:33
scala> rdd2.take(5).foreach(println)
(3.94.78.5,69827,15/Sep/2013:23:58:36 +0100)
(19.33.140.62,21475,15/Sep/2013:23:58:34 +0100)
(19.31.140.62,2489,15/Sep/2013:23:58:34 +0100)
(3.91.78.5,69827,15/Sep/2013:23:58:36 +0100)
(19.32.140.62,2489,15/Sep/2013:23:58:34 +0100)
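The patterns above work on this input, but an unescaped '.' in a regex matches any character. A stricter sketch escapes the dots and captures the port digits directly, avoiding the slice(...) cleanup used below:

val ip_Pattern = """\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}""".r
val port_Pattern = """- (\d{1,5})""".r
// port_Pattern.findFirstMatchIn(line).map(_.group(1).toInt) yields just the digits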
scala> val df1 = rdd2.toDF("ipAddress","portNo","TimeStamp")
df1: org.apache.spark.sql.DataFrame = [ipAddress: string, portNo: int ... 1 more field]
scala> df1.show
+------------+------+--------------------+
| ipAddress|portNo| TimeStamp|
+------------+------+--------------------+
| 3.94.78.5| 69827|15/Sep/2013:23:58...|
|19.33.140.62| 21475|15/Sep/2013:23:58...|
|19.31.140.62| 2489|15/Sep/2013:23:58...|
| 3.91.78.5| 69827|15/Sep/2013:23:58...|
|19.32.140.62| 2489|15/Sep/2013:23:58...|
|19.35.140.62| 2489|15/Sep/2013:23:58...|
| 3.93.78.5| 69827|15/Sep/2013:23:58...|
|12.38.140.62| 2489|15/Sep/2013:23:58...|
|12.38.140.62| 4712|15/Sep/2013:23:58...|
| 13.94.78.5| 69827|15/Sep/2013:23:58...|
|11.38.140.62| 4712|15/Sep/2013:23:58...|
|12.38.140.62| 4712|15/Sep/2013:23:58...|
+------------+------+--------------------+
scala> val df2 = df1.select ("*").groupBy("ipAddress").agg(count("*"))
df2: org.apache.spark.sql.DataFrame = [ipAddress: string, count(1): bigint]
scala> df2.show
+------------+--------+
| ipAddress|count(1)|
+------------+--------+
|12.38.140.62| 3|
|19.31.140.62| 1|
| 3.94.78.5| 1|
| 3.93.78.5| 1|
|19.35.140.62| 1|
|11.38.140.62| 1|
|19.33.140.62| 1|
| 3.91.78.5| 1|
| 13.94.78.5| 1|
|19.32.140.62| 1|
+------------+--------+
scala> val df2 = df1.select ("*").groupBy("ipAddress").agg(count("*") as "count")
df2: org.apache.spark.sql.DataFrame = [ipAddress: string, count: bigint]
scala> df2.show
+------------+-----+
| ipAddress|count|
+------------+-----+
|12.38.140.62| 3|
|19.31.140.62| 1|
| 3.94.78.5| 1|
| 3.93.78.5| 1|
|19.35.140.62| 1|
|11.38.140.62| 1|
|19.33.140.62| 1|
| 3.91.78.5| 1|
| 13.94.78.5| 1|
|19.32.140.62| 1|
+------------+-----+
scala> df2.write.saveAsTable("joy.outstanding")
// joy is a database that already exists in Hive
scala> spark.sql("use joy");
res6: org.apache.spark.sql.DataFrame = []
// fetching Hive table info within Spark
scala> spark.sql("select * from outstanding").show
+------------+-----+
| ipAddress|count|
+------------+-----+
|12.38.140.62| 3|
|19.31.140.62| 1|
| 3.94.78.5| 1|
| 3.93.78.5| 1|
|19.35.140.62| 1|
|11.38.140.62| 1|
|19.33.140.62| 1|
| 3.91.78.5| 1|
| 13.94.78.5| 1|
|19.32.140.62| 1|
+------------+-----+
MySQL to Hive using SQOOP import-all-tables
[cloudera@quickstart ~]$ sqoop import-all-tables --num-mappers 1 --connect jdbc:mysql://localhost/world --username root --password cloudera --hive-import --hive-overwrite --create-hive-table --compress --compression-codec org.apache.hadoop.io.compress.SnappyCodec --outdir java_files;
chgrp: changing ownership of 'hdfs://quickstart.cloudera:8020/user/hive/warehouse/city': User does not belong to supergroup
chgrp: changing ownership of 'hdfs://quickstart.cloudera:8020/user/hive/warehouse/country': User does not belong to supergroup
Loading data to table default.city
Loading data to table default.country
hive> show tables;
OK
city
country
Time taken: 0.013 seconds, Fetched: 2 row(s)
hive> select * from city limit 10;
OK
1 Kabul AFG Kabol 1780000
2 Qandahar AFG Qandahar 237500
3 Herat AFG Herat 186800
4 Mazar-e-Sharif AFG Balkh 127800
5 Amsterdam NLD Noord-Holland 731200
6 Rotterdam NLD Zuid-Holland 593321
7 Haag NLD Zuid-Holland 440900
8 Utrecht NLD Utrecht 234323
9 Eindhoven NLD Noord-Brabant 201843
10 Tilburg NLD Noord-Brabant 193238
Time taken: 0.614 seconds, Fetched: 10 row(s)
hive> describe formatted city;
Location: hdfs://quickstart.cloudera:8020/user/hive/warehouse/city
[cloudera@quickstart ~]$ hdfs dfs -ls /user/hive/warehouse/city
Found 2 items
-rwxrwxrwx 1 cloudera cloudera 0 2018-12-28 06:31 /user/hive/warehouse/city/_SUCCESS
-rwxrwxrwx 1 cloudera cloudera 93338 2018-12-28 06:31 /user/hive/warehouse/city/part-m-00000.snappy
Hive Notes - Part 3
Array:
collection of homogeneous items; the purpose of each element is the same.
qual = ['BTech','MTech','PhD']
prices = [23.32,54.23,34.23]
Struct:
collection of heterogeneous items (a Tuple in Pig); the purpose of each element is different.
info = ('Ravi', 26, 'Hyd')
Map:
collection of key,value pairs
qual = {'BTech':89, 'MTech':79}
Choosing Hive types for fields:
name -> string
age -> int
sal -> double
city -> string
wife -> struct
qual -> array
exp -> map
[cloudera@quickstart ~]$ cat > file1.txt
100,200,300
200,500,900
11,12,12
1,2,3
hive> show databases;
OK
default
fbi_crime_db
practice
Time taken: 0.732 seconds, Fetched: 3 row(s)
hive> use practice;
OK
Time taken: 0.149 seconds
hive> show tables;
OK
couples
infos
profile1
profile2
profile3
qtab
res
strans
trans
tras
Time taken: 0.079 seconds, Fetched: 10 row(s)
hive> describe profile1;
OK
id int
name string
age int
qual array<string>
city string
Time taken: 0.177 seconds, Fetched: 5 row(s)
hive> describe formatted profile1;
OK
# col_name data_type comment
id int
name string
age int
qual array<string>
city string
# Detailed Table Information
Database: practice
Owner: cloudera
CreateTime: Fri Nov 16 19:24:59 PST 2018
LastAccessTime: UNKNOWN
Protect Mode: None
Retention: 0
Location: hdfs://quickstart.cloudera:8020/user/hive/warehouse/practice.db/profile1
Table Type: MANAGED_TABLE
Table Parameters:
COLUMN_STATS_ACCURATE true
numFiles 1
totalSize 62
transient_lastDdlTime 1542425232
# Storage Information
SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
InputFormat: org.apache.hadoop.mapred.TextInputFormat
OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
Compressed: No
Num Buckets: -1
Bucket Columns: []
Sort Columns: []
Storage Desc Params:
colelction.delim #
field.delim ,
serialization.format ,
Time taken: 0.104 seconds, Fetched: 35 row(s)
hive> describe extended profile1;
OK
id int
name string
age int
qual array<string>
city string
Detailed Table Information Table(tableName:profile1, dbName:practice, owner:cloudera, createTime:1542425099, lastAccessTime:0, retention:0, sd:StorageDescriptor(cols:[FieldSchema(name:id, type:int, comment:null), FieldSchema(name:name, type:string, comment:null), FieldSchema(name:age, type:int, comment:null), FieldSchema(name:qual, type:array<string>, comment:null), FieldSchema(name:city, type:string, comment:null)], location:hdfs://quickstart.cloudera:8020/user/hive/warehouse/practice.db/profile1, inputFormat:org.apache.hadoop.mapred.TextInputFormat, outputFormat:org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat, compressed:false, numBuckets:-1, serdeInfo:SerDeInfo(name:null, serializationLib:org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, parameters:{colelction.delim=#, serialization.format=,, field.delim=,}), bucketCols:[], sortCols:[], parameters:{}, skewedInfo:SkewedInfo(skewedColNames:[], skewedColValues:[], skewedColValueLocationMaps:{}), storedAsSubDirectories:false), partitionKeys:[], parameters:{numFiles=1, transient_lastDdlTime=1542425232, COLUMN_STATS_ACCURATE=true, totalSize=62}, viewOriginalText:null, viewExpandedText:null, tableType:MANAGED_TABLE)
Time taken: 0.091 seconds, Fetched: 7 row(s)
hive> create table info1(a int, b int, c int);
OK
Time taken: 0.213 seconds
hive> load data local inpath 'file1.txt' into table info1;
Loading data to table practice.info1
Table practice.info1 stats: [numFiles=1, totalSize=39]
OK
Time taken: 1.364 seconds
hive> select * from info1;
OK
NULL NULL NULL
NULL NULL NULL
NULL NULL NULL
NULL NULL NULL
Time taken: 0.458 seconds, Fetched: 4 row(s)
Delimiter:
The default field delimiter in Hive is Ctrl+A ('\001'), often displayed as a special symbol in editors.
But the file contains no '\001' characters, so the entire line is treated as a single column.
That entire line is a string, which can't fit into the 1st column 'a' (an int), so 'a' became NULL.
As per the '\001' delimiter there are no 2nd and 3rd fields in the file at all,
which is why the a, b, c columns came out as NULL, NULL, NULL.
Solution: change the delimiter of the table.
hive> create table info2(a int, b int, c int) row format delimited fields terminated by ',';
OK
Time taken: 0.06 seconds
hive> load data local inpath 'file1.txt' into table info2;
Loading data to table practice.info2
Table practice.info2 stats: [numFiles=1, totalSize=39]
OK
Time taken: 0.24 seconds
hive> select * from info2;
OK
100 200 300
200 500 900
11 12 12
1 2 3
Time taken: 0.077 seconds, Fetched: 4 row(s)
[cloudera@quickstart ~]$ hdfs dfs -ls /user/hive/warehouse/practice.db/info2
Found 1 items
-rwxrwxrwx 1 cloudera supergroup 39 2018-11-18 18:52 /user/hive/warehouse/practice.db/info2/file1.txt
[cloudera@quickstart ~]$ hdfs dfs -cat /user/hive/warehouse/practice.db/info2/file1.txt
100,200,300
200,500,900
11,12,12
1,2,3
Array Example:
---------------
[cloudera@quickstart ~]$ cat > emp.txt
101,aaaa,40000,m,11
102,bbbb,44000,f,12
103,cccc,60000,m,12
hive> create table emp(id int, name string, sal int, sex string, dno int) row format delimited fields terminated by ',';
OK
Time taken: 0.067 seconds
hive> load data local inpath 'emp.txt' into table emp;
Loading data to table practice.emp
Table practice.emp stats: [numFiles=1, totalSize=60]
OK
Time taken: 0.198 seconds
hive> select * from emp;
OK
101 aaaa 40000 m 11
102 bbbb 44000 f 12
103 cccc 60000 m 12
Time taken: 0.11 seconds, Fetched: 3 row(s)
[cloudera@quickstart ~]$ cat > profile1.txt
101,Ravi,BTech#MTech#PhD,26,Hyd
102,Rani,Bsc#Msc#MTech,27,Del
hive> create table profile1 (id int, name string, qual array<string>, age int, city string) row format delimited fields terminated by ',' collection items terminated by '#';
OK
Time taken: 0.09 seconds
Struct example:
---------------
[cloudera@quickstart ~]$ cat > profile2.txt
Ravi,26,Rani#24#Hyd,Del
Giri,24,Soni#23#Del,Hyd
hive> load data local inpath 'profile1.txt' into table profile1;
Loading data to table practice.profile1
Table practice.profile1 stats: [numFiles=1, totalSize=62]
OK
Time taken: 0.333 seconds
hive> select * from profile1;
OK
101 Ravi ["BTech","MTech","PhD"] 26 Hyd
102 Rani ["Bsc","Msc","MTech"] 27 Del
Time taken: 0.065 seconds, Fetched: 2 row(s)
hive> create table profile2(name string, age int, wife struct<name:string,age:int,city:string>, city string) row format delimited fields terminated by ',' collection items terminated by '#';
OK
Time taken: 0.068 seconds
hive> load data local inpath 'profile2.txt' into table profile2;
Loading data to table practice.profile2
Table practice.profile2 stats: [numFiles=1, totalSize=48]
OK
Time taken: 0.242 seconds
hive> select * from profile2;
OK
Ravi 26 {"name":"Rani","age":24,"city":"Hyd"} Del
Giri 24 {"name":"Soni","age":23,"city":"Del"} Hyd
Time taken: 0.049 seconds, Fetched: 2 row(s)
Map example:
------------
[cloudera@quickstart ~]$ cat > profile3.txt
Hari,25,BTech$67#MTech$57#MBA$89,Hyd
Mani,26,Bsc$90#Msc$80#MBA$70,Del
hive> create table profile3(name string, age int, qual map<string,int>, city string) row format delimited fields terminated by ',' collection items terminated by '#' map keys terminated by '$';
OK
Time taken: 0.085 seconds
hive> load data local inpath 'profile3.txt' into table profile3;
Loading data to table practice.profile3
Table practice.profile3 stats: [numFiles=1, totalSize=70]
OK
Time taken: 0.163 seconds
hive> select * from profile3;
OK
Hari 25 {"BTech":67,"MTech":57,"MBA":89} Hyd
Mani 26 {"Bsc":90,"Msc":80,"MBA":70} Del
Create a text file with the following contents:
-----------------------------------------------
[cloudera@quickstart ~]$ cat > prof1
101,Ravi,30,BTech#MTech#PhD,Hyd
102,Rani,25,Bsc#Msc#MTech,Del
create a database in hive:
---------------------------
hive> create database practice;
OK
Time taken: 0.183 seconds
Open a database:
-----------------
hive> use practice;
OK
create a table in hive:
-------------------------
hive> create table profile1 (id int, name string, age int, qual array<string>, city string) row format delimited fields terminated by ',' collection items terminated by '#';
load data from text file:
-------------------------
hive> load data local inpath 'prof1' into table profile1;
Get the structure of a table:
-----------------------------
hive> describe profile1;
OK
id int
name string
age int
qual array<string>
city string
Time taken: 0.208 seconds, Fetched: 5 row(s)
Get more detailed description of a table:
-----------------------------------------
hive> describe formatted profile1;
OK
# col_name data_type comment
id int
name string
age int
qual array<string>
city string
# Detailed Table Information
Database: practice
Owner: cloudera
CreateTime: Fri Nov 16 19:24:59 PST 2018
LastAccessTime: UNKNOWN
Protect Mode: None
Retention: 0
Location: hdfs://quickstart.cloudera:8020/user/hive/warehouse/practice.db/profile1
Table Type: MANAGED_TABLE
Table Parameters:
COLUMN_STATS_ACCURATE true
numFiles 1
totalSize 62
transient_lastDdlTime 1542425232
# Storage Information
SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
InputFormat: org.apache.hadoop.mapred.TextInputFormat
OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
Compressed: No
Num Buckets: -1
Bucket Columns: []
Sort Columns: []
Storage Desc Params:
colelction.delim #
field.delim ,
serialization.format ,
Time taken: 0.12 seconds, Fetched: 35 row(s)
// This is an internal (managed) table, so it is stored in: /user/hive/warehouse/practice.db/profile1
hive> select * from profile1;
OK
101 Ravi 30 ["BTech","MTech","PhD"] Hyd
102 Rani 25 ["Bsc","Msc","MTech"] Del
Time taken: 0.654 seconds, Fetched: 2 row(s)
// retrieve first qualification
hive> select name,qual[0] from profile1;
OK
Ravi BTech
Rani Bsc
// size(qual) --> total number of elements...
hive> select name, size(qual) from profile1;
OK
Ravi 3
Rani 3
// total number of characters in their names
hive> select length(name) from profile1;
OK
4
4
size() is applied on arrays; length() is applied on strings.
get Qualifications:
-------------------
hive> select qual from profile1;
OK
["BTech","MTech","PhD"]
["Bsc","Msc","MTech"]
// How to flatten array elements?
//explode the given array to make flattened strings
hive> select explode(qual) from profile1;
OK
BTech
MTech
PhD
Bsc
Msc
MTech
hive> create table qtab(qual string);
OK
Time taken: 0.068 seconds
hive> insert into table qtab select explode(qual) as q from profile1;
hive> select * from qtab;
OK
BTech
MTech
PhD
Bsc
Msc
MTech
How many people have MTech? How many have BTech?
// use the following to find out
hive> select qual,count(*) from qtab group by qual;
BTech 1
Bsc 1
MTech 2
Msc 1
PhD 1
// create a result table:
hive> create table res(qual string, cnt int) row format delimited fields terminated by ',';
OK
Time taken: 0.161 seconds
// insert into select (select query result will be the input)
hive> insert into table res select qual,count(*) from qtab group by qual;
// see the result here
hive> select * from res;
OK
BTech 1
Bsc 1
MTech 2
Msc 1
PhD 1
Hive has 3 types of functions:
------------------------------
i) UDF (user defined functions)
substr(), length(), size()
For each row, one value is returned.
ii) UDAF (user defined aggregate functions)
sum(), count(), min(), max(), avg()
For an entire column or an entire group, these functions return a single value.
iii) UDTF (user defined table generating functions)
explode(), json_tuple(), parse_url_tuple()
create sales text file:
-------------------------
[cloudera@quickstart ~]$ cat > sales
c101,10#20#100
c102,100#50
c103,100#50
c101,600#400
What is the total bill for c101? (10+20+100+600+400) => 1130
create transaction table:
---------------------------
hive> create table trans (cid string, price array<int>) row format delimited fields terminated by ',' collection items terminated by '#';
OK
Time taken: 0.093 seconds
load data from text file (sales) into transaction table:
-------------------------------------------------
hive> load data local inpath 'sales' into table trans;
// display the content of the table
hive> select * from trans;
OK
c101 [10,20,100]
c102 [100,50]
c103 [100,50]
c101 [600,400]
// explode alone - we lose track of which cid each price belongs to
hive> select explode(price) as pr from trans;
OK
10
20
100
100
50
100
50
600
400
Time taken: 0.056 seconds, Fetched: 9 row(s)
// other columns are not allowed alongside UDTF functions in a plain select;
// lateral view solves this: c101 [20,30] ==>
// c101, 20
// c101, 30
hive> select cid, mypr from trans lateral view explode(price) p as mypr;
OK
c101 10
c101 20
c101 100
c102 100
c102 50
c103 100
c103 50
c101 600
c101 400
hive> create table strans (cid string, price int);
hive> insert into table strans select cid, mypr from trans lateral view explode(price) p as mypr;
hive> select * from strans;
OK
c101 10
c101 20
c101 100
c102 100
c102 50
c103 100
c103 50
c101 600
c101 400
Cid    Price
C101   [10,20]
C102   [30,20,40]

Explode of C101's array:
C101 10
C101 20
Union with the explode of C102's array:
C102 30
C102 20
C102 40

Result:
C101 10
C101 20
C102 30
C102 20
C102 40
hive> select cid,sum(price) from strans group by cid;
c101 1130
c102 150
c103 150
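For comparison with the Spark sections of these notes, lateral view explode has a direct DataFrame equivalent. A sketch assuming Hive integration is set up and trans is visible to Spark as a table with cid string and price array<int>:

val dfTrans = spark.table("trans")
dfTrans.select($"cid", explode($"price") as "pr")   // one row per (cid, price element)
       .groupBy("cid")
       .agg(sum("pr") as "bill")
       .show
// should mirror: select cid, sum(price) from strans group by cid;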
Profile:
--------
[cloudera@quickstart ~]$ cat > prof2
Ravi,30,Rani#25#Del,Hyd
Raghu,35,Sailu#23#Hyd,Del
hive> create table profile2(name string, age int, wife struct<name:string,age:int,city:string>, city string) row format delimited fields terminated by ',' collection items terminated by '#';
OK
hive> load data local inpath 'prof2' into table profile2;
hive> select * from profile2;
OK
Ravi 30 {"name":"Rani","age":25,"city":"Del"} Hyd
Raghu 35 {"name":"Sailu","age":23,"city":"Hyd"} Del
hive> describe profile2;
OK
name string
age int
wife struct<name:string,age:int,city:string>
city string
Time taken: 0.094 seconds, Fetched: 4 row(s)
hive> select name,age,wife.name,wife.age from profile2;
OK
Ravi 30 Rani 25
Raghu 35 Sailu 23
Time taken: 0.056 seconds, Fetched: 2 row(s)
hive> create table couples (hname string, wname string, hage int, wage int, hcity string, wcity string) row format delimited fields terminated by '\t';
OK
hive> insert into table couples select name, wife.name, age, wife.age, city, wife.city from profile2;
hive> select * from couples;
OK
Ravi Rani 30 25 Hyd Del
Raghu Sailu 35 23 Del Hyd
[cloudera@quickstart ~]$ cat > prof3
Ravi,30,BTech$89#MTech$79#MBA$60,Hyd
Raghu,35,Bsc$80#Msc$70,Del
hive> create table profile3(name string, age int, qual map<string,int>, city string) row format delimited fields terminated by ',' collection items terminated by '#' map keys terminated by '$';
OK
hive> load data local inpath 'prof3' into table profile3;
hive> load data local inpath 'prof3' overwrite into table profile3;
hive> select * from profile3;
OK
Ravi 30 {"BTech":89,"MTech":79,"MBA":60} Hyd
Raghu 35 {"Bsc":80,"Msc":70} Del
hive> select name, qual['BTech'] from profile3;
OK
Ravi 89
Raghu NULL
Time taken: 0.056 seconds, Fetched: 2 row(s)
hive> select name,qual["Msc"] from profile3;
OK
Ravi NULL
Raghu 70
Time taken: 0.046 seconds, Fetched: 2 row(s)
// get all keys from the map
hive> select map_keys(qual) from profile3;
OK
["BTech","MTech","MBA"]
["Bsc","Msc"]
hive> select name, map_keys(qual),map_values(qual) from profile3;
OK
Ravi ["BTech","MTech","MBA"] [89,79,60]
Raghu ["Bsc","Msc"] [80,70]
hive> show databases;
OK
default
fbi_crime_db
practice
Time taken: 0.689 seconds, Fetched: 3 row(s)
hive> use practice;
OK
hive> create table infos(name string, qual array<string>, percentage array<int>);
hive> insert overwrite table infos select name, map_keys(qual), map_values(qual) from profile3;
hive> select * from infos;
OK
Ravi ["BTech","MTech","MBA"] [89,79,60]
Raghu ["Bsc","Msc"] [80,70]
hive> select name,myq from infos lateral view explode(qual) ql as myq;
OK
Ravi BTech
Ravi MTech
Ravi MBA
Raghu Bsc
Raghu Msc
Time taken: 0.064 seconds, Fetched: 5 row(s)
hive> select name,myp from infos lateral view explode(percentage) p as myp;
OK
Ravi 89
Ravi 79
Ravi 60
Raghu 80
Raghu 70
Time taken: 0.056 seconds, Fetched: 5 row(s)
Hive Table types:
Internal table:
If an internal table is dropped, both the metadata and the data are dropped.
External table:
Only the metadata is dropped.
Partitioned and non-partitioned tables:
By default each table is non-partitioned.
A partition is a subdirectory inside the table's directory in HDFS.
Advantage of partitioning:
No need to scan all the table data.
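For comparison, Spark produces the same partition-directory layout when writing with partitionBy. A sketch (the output path and format are assumptions, using a DataFrame shaped like the emp data, e.g. dfEmp from the earlier section):

dfEmp.write
     .partitionBy("gender")        // creates gender=f/ and gender=m/ subdirectories
     .mode("overwrite")
     .format("parquet")
     .save("/user/hadoop/emp_partitioned")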
[cloudera@quickstart ~]$ cat > emp
101,aaaa,40000,m,11
102,bbbb,50000,f,12
103,cccc,90000,m,12
104,dddd,10000,f,13
105,eeee,20000,m,11
[cloudera@quickstart ~]$ cat > emp2
201,ee,80000,f,12
202,xx,90000,m,13
[cloudera@quickstart ~]$ cat > emp3
301,aaaaa,900000,m,11
302,iop,100000,f,12
303,hhg,200000,m,11
304,ghgh,300000,f,13
hive> create table emp(id int, name string, sal int, sex string, dno int) row format delimited fields terminated by ',';
OK
hive> load data local inpath 'emp' into table emp;
Loading data to table practice.emp
Table practice.emp stats: [numFiles=1, totalSize=100]
OK
hive> load data local inpath 'emp2' into table emp;
Loading data to table practice.emp
Table practice.emp stats: [numFiles=1, totalSize=100]
OK
hive> load data local inpath 'emp3' into table emp;
Loading data to table practice.emp
Table practice.emp stats: [numFiles=1, totalSize=100]
OK
hive> select * from emp;
OK
101 aaaa 40000 m 11
102 bbbb 50000 f 12
103 cccc 90000 m 12
104 dddd 10000 f 13
105 eeee 20000 m 11
201 ee 80000 f 12
202 xx 90000 m 13
301 aaaaa 900000 m 11
302 iop 100000 f 12
303 hhg 200000 m 11
304 ghgh 300000 f 13
Time taken: 0.066 seconds, Fetched: 11 row(s)
hive> describe formatted emp;
OK
# col_name data_type comment
id int
name string
sal int
sex string
dno int
# Detailed Table Information
Database: practice
Owner: cloudera
CreateTime: Sun Nov 18 22:05:00 PST 2018
LastAccessTime: UNKNOWN
Protect Mode: None
Retention: 0
Location: hdfs://quickstart.cloudera:8020/user/hive/warehouse/practice.db/emp
Table Type: MANAGED_TABLE
Table Parameters:
COLUMN_STATS_ACCURATE true
numFiles 3
totalSize 219
transient_lastDdlTime 1542607585
# Storage Information
SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
InputFormat: org.apache.hadoop.mapred.TextInputFormat
OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
Compressed: No
Num Buckets: -1
Bucket Columns: []
Sort Columns: []
Storage Desc Params:
field.delim ,
serialization.format ,
Time taken: 0.054 seconds, Fetched: 34 row(s)
// Non-partitioned table, so no subdirectories - just the table directory:
hdfs dfs -ls /user/hive/warehouse/practice.db/emp
Found 3 items
-rwxrwxrwx 1 cloudera supergroup 100 2018-11-18 22:06 /user/hive/warehouse/practice.db/emp/emp
-rwxrwxrwx 1 cloudera supergroup 36 2018-11-18 22:06 /user/hive/warehouse/practice.db/emp/emp2
-rwxrwxrwx 1 cloudera supergroup 83 2018-11-18 22:06 /user/hive/warehouse/practice.db/emp/emp3
Partitions are not created when you create a table.
Partitions are created when you load data.
create table epart(id int, name string, sal int, sex string, dno int) partitioned by (s string);
hive> insert overwrite table epart partition (s = 'f') select * from emp where sex = 'f';
hive> insert overwrite table epart partition (s = 'm') select * from emp where sex = 'm';
hive> select * from epart;
OK
102 bbbb 50000 f 12 f
104 dddd 10000 f 13 f
201 ee 80000 f 12 f
302 iop 100000 f 12 f
304 ghgh 300000 f 13 f
101 aaaa 40000 m 11 m
103 cccc 90000 m 12 m
105 eeee 20000 m 11 m
202 xx 90000 m 13 m
301 aaaaa 900000 m 11 m
303 hhg 200000 m 11 m
Time taken: 0.172 seconds, Fetched: 11 row(s)
hive> describe epart;
OK
id int
name string
sal int
sex string
dno int
s string
# Partition Information
# col_name data_type comment
s string
Time taken: 0.191 seconds, Fetched: 11 row(s)
hive> describe formatted epart;
OK
# col_name data_type comment
id int
name string
sal int
sex string
dno int
# Partition Information
# col_name data_type comment
s string
# Detailed Table Information
Database: practice
Owner: cloudera
CreateTime: Sun Nov 18 22:22:37 PST 2018
LastAccessTime: UNKNOWN
Protect Mode: None
Retention: 0
Location: hdfs://quickstart.cloudera:8020/user/hive/warehouse/practice.db/epart
Table Type: MANAGED_TABLE
Table Parameters:
numPartitions 2
transient_lastDdlTime 1542608557
# Storage Information
SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
InputFormat: org.apache.hadoop.mapred.TextInputFormat
OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
Compressed: No
Num Buckets: -1
Bucket Columns: []
Sort Columns: []
Storage Desc Params:
serialization.format 1
Time taken: 0.106 seconds, Fetched: 36 row(s)
hdfs dfs -ls /user/hive/warehouse/practice.db/epart
Found 2 items
drwxrwxrwx - cloudera supergroup 0 2018-11-18 22:24 /user/hive/warehouse/practice.db/epart/s=f
drwxrwxrwx - cloudera supergroup 0 2018-11-18 22:30 /user/hive/warehouse/practice.db/epart/s=m
hdfs dfs -ls /user/hive/warehouse/practice.db/epart/s=f
Found 1 items
-rwxrwxrwx 1 cloudera supergroup 99 2018-11-18 22:24 /user/hive/warehouse/practice.db/epart/s=f/000000_0
hdfs dfs -cat /user/hive/warehouse/practice.db/epart/s=f/000000_0
102 bbbb 50000 f 12
104 dddd 10000 f 13
201 ee 80000 f 12
302 iop 100000 f 12
304 ghgh 300000 f 13
hdfs dfs -cat /user/hive/warehouse/practice.db/epart/s=m/000000_0
101 aaaa 40000 m 11
103 cccc 90000 m 12
105 eeee 20000 m 11
202 xx 90000 m 13
301 aaaaa 900000 m 11
303 hhg 200000 m 11
/user/hive/warehouse/practice.db -- database name as directory
/epart -- table name as directory
/s=m -- partition name as sub directory
/000000_0 -- data will be here as file
/user/hive/warehouse/practice.db -- database name as directory
/epart -- table name as directory
/s=f -- partition name as sub directory
/000000_0 -- data will be here as file
hive> select * from epart;
OK
102 bbbb 50000 f 12 f
104 dddd 10000 f 13 f
201 ee 80000 f 12 f
302 iop 100000 f 12 f
304 ghgh 300000 f 13 f
101 aaaa 40000 m 11 m
103 cccc 90000 m 12 m
105 eeee 20000 m 11 m
202 xx 90000 m 13 m
301 aaaaa 900000 m 11 m
303 hhg 200000 m 11 m
Time taken: 0.074 seconds, Fetched: 11 row(s)
hive> select * from epart where s ='f';
OK
102 bbbb 50000 f 12 f
104 dddd 10000 f 13 f
201 ee 80000 f 12 f
302 iop 100000 f 12 f
304 ghgh 300000 f 13 f
Time taken: 0.058 seconds, Fetched: 5 row(s)
hive> select * from epart where s ='m';
OK
101 aaaa 40000 m 11 m
103 cccc 90000 m 12 m
105 eeee 20000 m 11 m
202 xx 90000 m 13 m
301 aaaaa 900000 m 11 m
303 hhg 200000 m 11 m
Time taken: 0.073 seconds, Fetched: 6 row(s)
We should filter on the partitioned column to get the advantage of partitioning.
// filter condition based on a non-partitioned column - no advantage
hive> select * from epart where sex ='m';
OK
101 aaaa 40000 m 11 m
103 cccc 90000 m 12 m
105 eeee 20000 m 11 m
202 xx 90000 m 13 m
301 aaaaa 900000 m 11 m
303 hhg 200000 m 11 m
Time taken: 0.074 seconds, Fetched: 6 row(s)
hive> select * from epart where sex ='f';
OK
102 bbbb 50000 f 12 f
104 dddd 10000 f 13 f
201 ee 80000 f 12 f
302 iop 100000 f 12 f
304 ghgh 300000 f 13 f
Time taken: 0.076 seconds, Fetched: 5 row(s)
We are going to create a department-number-based partition now.
hive> create table eparts(id int, name string, sal int, sex string, dno int) partitioned by (d int) row format delimited fields terminated by ',';
// static partition - because we need to run the following queries one by one manually (statically)
hive> insert overwrite table eparts partition (d = 11) select * from emp where dno = 11;
hive> insert overwrite table eparts partition (d = 12) select * from emp where dno = 12;
hive> insert overwrite table eparts partition (d = 13) select * from emp where dno = 13;
hive> insert overwrite table eparts partition (d = 14) select * from emp where dno = 14;
hive> insert overwrite table eparts partition (d = 15) select * from emp where dno = 15;
[cloudera@quickstart ~]$ hdfs dfs -ls /user/hive/warehouse/practice.db/eparts
Found 5 items
drwxrwxrwx - cloudera supergroup 0 2018-11-18 22:50 /user/hive/warehouse/practice.db/eparts/d=11
drwxrwxrwx - cloudera supergroup 0 2018-11-18 22:51 /user/hive/warehouse/practice.db/eparts/d=12
drwxrwxrwx - cloudera supergroup 0 2018-11-18 23:04 /user/hive/warehouse/practice.db/eparts/d=13
drwxrwxrwx - cloudera supergroup 0 2018-11-18 23:04 /user/hive/warehouse/practice.db/eparts/d=14
drwxrwxrwx - cloudera supergroup 0 2018-11-18 23:05 /user/hive/warehouse/practice.db/eparts/d=15
[cloudera@quickstart ~]$ hdfs dfs -ls /user/hive/warehouse/practice.db/eparts/d=11
Found 1 items
-rwxrwxrwx 1 cloudera supergroup 82 2018-11-18 22:50 /user/hive/warehouse/practice.db/eparts/d=11/000000_0
[cloudera@quickstart ~]$ hdfs dfs -cat /user/hive/warehouse/practice.db/eparts/d=11/000000_0
101,aaaa,40000,m,11
105,eeee,20000,m,11
301,aaaaa,900000,m,11
303,hhg,200000,m,11
[cloudera@quickstart ~]$ hdfs dfs -cat /user/hive/warehouse/practice.db/eparts/d=12/000000_0
102,bbbb,50000,f,12
103,cccc,90000,m,12
201,ee,80000,f,12
302,iop,100000,f,12
[cloudera@quickstart ~]$ hdfs dfs -cat /user/hive/warehouse/practice.db/eparts/d=13/000000_0
104,dddd,10000,f,13
202,xx,90000,m,13
304,ghgh,300000,f,13
// Initially we created partitions based on sex (s='m' or s='f'),
// then we created partitions based on dno (d=11, d=12, ...).
We can also create partitions using multiple columns together.
//partition based on multiple columns...
hive> create table mpart (id int, name string, sal int, sex string, dno int) partitioned by(d int, s string);
hive> insert overwrite table mpart partition(d=11, s='f') select * from emp where dno=11 and sex ='f';
hive> insert overwrite table mpart partition(d=11, s='m') select * from emp where dno=11 and sex ='m';
hive> insert overwrite table mpart partition(d=12, s='f') select * from emp where dno=12 and sex ='f';
hive> insert overwrite table mpart partition(d=12, s='m') select * from emp where dno=12 and sex ='m';
hive> insert overwrite table mpart partition(d=13, s='f') select * from emp where dno=13 and sex ='f';
hive> insert overwrite table mpart partition(d=13, s='m') select * from emp where dno=13 and sex ='m';
hdfs dfs -ls /user/hive/warehouse/practice.db/mpart
Found 1 items
drwxrwxrwx - cloudera supergroup 0 2018-11-18 23:18 /user/hive/warehouse/practice.db/mpart/d=11
[cloudera@quickstart ~]$ hdfs dfs -ls /user/hive/warehouse/practice.db/mpart/d=11
Found 1 items
drwxrwxrwx - cloudera supergroup 0 2018-11-18 23:18 /user/hive/warehouse/practice.db/mpart/d=11/s=f
[cloudera@quickstart ~]$ hdfs dfs -ls /user/hive/warehouse/practice.db/mpart/d=11/s=f
Found 1 items
-rwxrwxrwx 1 cloudera supergroup 0 2018-11-18 23:18 /user/hive/warehouse/practice.db/mpart/d=11/s=f/000000_0
[cloudera@quickstart ~]$ hdfs dfs -cat /user/hive/warehouse/practice.db/mpart/d=11/s=f/000000_0
hdfs dfs -ls /user/hive/warehouse/practice.db/mpart/d=11
Found 2 items
drwxrwxrwx - cloudera supergroup 0 2018-11-18 23:18 /user/hive/warehouse/practice.db/mpart/d=11/s=f
drwxrwxrwx - cloudera supergroup 0 2018-11-18 23:22 /user/hive/warehouse/practice.db/mpart/d=11/s=m
hdfs dfs -cat /user/hive/warehouse/practice.db/mpart/d=11/s=m/000000_0
101 aaaa 40000 m 11
105 eeee 20000 m 11
301 aaaaa 900000 m 11
303 hhg 200000 m 11
/user/hive/warehouse/practice.db/mpart
/d=11
/s=f/000000_0
/s=m/000000_0
/d=12
/s=f/000000_0
/s=m/000000_0
/d=13
/s=f/000000_0
/s=m/000000_0
Dynamic partitions with multiple columns:
-------------------------------------------
create table dpart(id int, name string, sal int, sex string, dno int) partitioned by (d int, s string) row format delimited fields terminated by ',';
settings to enable dynamic partitions:
---------------------------------------
hive> set hive.exec.dynamic.partition = true;
hive> set hive.exec.dynamic.partition.mode = nonstrict;
insert overwrite table dpart partition(d,s) select id,name,sal,sex,dno,dno,sex from emp;
// look here -- dynamic partitions created
hdfs dfs -ls /user/hive/warehouse/practice.db/dpart
Found 3 items
drwxrwxrwx - cloudera supergroup 0 2018-11-19 00:34 /user/hive/warehouse/practice.db/dpart/d=11
drwxrwxrwx - cloudera supergroup 0 2018-11-19 00:34 /user/hive/warehouse/practice.db/dpart/d=12
drwxrwxrwx - cloudera supergroup 0 2018-11-19 00:34 /user/hive/warehouse/practice.db/dpart/d=13
// based on the input data, the following subfolders were created dynamically (on the fly)
[cloudera@quickstart ~]$ hdfs dfs -ls /user/hive/warehouse/practice.db/dpart/d=11
Found 1 items
drwxrwxrwx - cloudera supergroup 0 2018-11-19 00:34 /user/hive/warehouse/practice.db/dpart/d=11/s=m
[cloudera@quickstart ~]$ hdfs dfs -ls /user/hive/warehouse/practice.db/dpart/d=12
Found 2 items
drwxrwxrwx - cloudera supergroup 0 2018-11-19 00:34 /user/hive/warehouse/practice.db/dpart/d=12/s=f
drwxrwxrwx - cloudera supergroup 0 2018-11-19 00:34 /user/hive/warehouse/practice.db/dpart/d=12/s=m
[cloudera@quickstart ~]$ hdfs dfs -ls /user/hive/warehouse/practice.db/dpart/d=13
Found 2 items
drwxrwxrwx - cloudera supergroup 0 2018-11-19 00:34 /user/hive/warehouse/practice.db/dpart/d=13/s=f
drwxrwxrwx - cloudera supergroup 0 2018-11-19 00:34 /user/hive/warehouse/practice.db/dpart/d=13/s=m
// without scanning all the records, it fetches directly from the d=13/s=f folder
hive> select * from dpart where d = 13 and s='f';
OK
104 dddd 10000 f 13 13 f
304 ghgh 300000 f 13 13 f
Time taken: 0.178 seconds, Fetched: 2 row(s)
sample data:
-------------
gedit sales.txt
----------------
01/01/2011,45000
02/01/2011,46000
03/01/2011,45500
04/01/2011,46050
05/01/2011,45005
06/01/2011,46010
07/01/2011,45200
08/01/2011,43000
09/01/2011,48000
10/01/2011,42001
11/01/2011,45002
12/01/2011,46003
13/01/2011,45504
14/01/2011,46055
15/01/2011,45006
16/01/2011,46017
17/01/2011,45208
18/01/2011,43009
19/01/2011,48010
20/01/2011,42011
01/01/2011,45000
02/01/2011,46000
03/01/2011,45500
04/01/2011,46050
05/01/2011,45005
06/01/2011,46010
07/01/2011,45200
08/01/2011,43000
09/01/2011,48000
10/01/2011,42001
11/01/2011,45002
12/01/2011,46003
13/01/2011,45504
14/01/2011,46055
15/01/2011,45006
16/01/2011,46017
17/01/2011,45208
18/01/2011,43009
19/01/2011,48010
20/01/2011,42011
01/01/2011,45000
02/01/2011,46000
03/01/2011,45500
04/01/2011,46050
05/01/2011,45005
06/01/2011,46010
07/01/2011,45200
08/01/2011,43000
09/01/2011,48000
10/01/2011,42001
11/01/2011,45002
12/01/2011,46003
13/01/2011,45504
14/01/2011,46055
15/01/2011,45006
16/01/2011,46017
17/01/2011,45208
18/01/2011,43009
19/01/2011,48010
20/01/2011,42011
hive> create table raw (dt string, amt int) row format delimited fields terminated by ',';
OK
Time taken: 0.073 seconds
hive> load data local inpath 'sales.txt' into table raw;
Loading data to table practice.raw
Table practice.raw stats: [numFiles=1, totalSize=52]
OK
Time taken: 0.182 seconds
hive> select * from raw;
OK
01/01/2011 45000
02/01/2011 46000
03/01/2011 45500
04/01/2011 46050
05/01/2011 45005
06/01/2011 46010
07/01/2011 45200
08/01/2011 43000
09/01/2011 48000
10/01/2011 42001
11/01/2011 45002
12/01/2011 46003
13/01/2011 45504
14/01/2011 46055
15/01/2011 45006
16/01/2011 46017
17/01/2011 45208
18/01/2011 43009
19/01/2011 48010
20/01/2011 42011
Time taken: 0.047 seconds, Fetched: 20 row(s)
RDBMS date format is : 20/01/2011 (DD/MM/YYYY)
but Hive Date Format is : 2017-01-23 (YYYY-MM-DD)
hive> create table raw2 (dt array<string>, amt int);
OK
Time taken: 0.054 seconds
hive> insert into table raw2 select split(dt,'/'), amt from raw;
hive> select split(dt,'/'), amt from raw;
OK
["01","01","2011"] 45000
["02","01","2011"] 46000
hive> select * from raw2;
OK
["01","01","2011"] 45000
["02","01","2011"] 46000
hive> select concat(dt[2],'-',dt[0],'-',dt[1]) from raw2;
OK
2011-01-01
2011-02-01
2011-03-01
// note: dt[0] is the day field, so this concat writes the day into the month position; the correct order is concat(dt[2],'-',dt[1],'-',dt[0]). Rows like 13/01/2011 will later surface as the invalid date 2011-13-01.
hive> create table rawx like raw;
// make duplicate rows for 2012, 2013...
insert into table rawx select dt, amt from raw union all select concat(substr(dt,1,9),'2') as dt, amt+2000 as amt from raw union all select concat(substr(dt,1,9),'3') as dt, amt+10000 as amt from raw;
//rawx has 3 years data
hive> select * from rawx;
OK
01/01/2011 45000
01/01/2012 47000
01/01/2013 55000
02/01/2011 46000
hive> insert overwrite table raw2 select split(dt,'/'), amt from rawx;
hive> set hive.exec.dynamic.partition = true;
hive> set hive.exec.dynamic.partition.mode=nonstrict;
hive> set hive.exec.max.dynamic.partitions.pernode = 100000;
hive> create table sales(dt string, amt int);
OK
Time taken: 0.068 seconds
hive> insert into table sales select concat(dt[2],'-',dt[0],'-',dt[1]),amt from raw2;
hive> select * from sales limit 3;
OK
2011-01-01 45000
2012-01-01 47000
2013-01-01 55000
Time taken: 0.058 seconds, Fetched: 3 row(s)
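In Spark the same reformatting is one step with to_date, which also avoids the day/month swap. A sketch, assuming the raw dd/MM/yyyy strings are in a DataFrame salesRaw (hypothetical name) with columns dt and amt:
scala> import org.apache.spark.sql.functions._
scala> // parse as a real date with an explicit pattern, then render in Hive's yyyy-MM-dd format
scala> val sales2 = salesRaw.withColumn("dt", date_format(to_date($"dt", "dd/MM/yyyy"), "yyyy-MM-dd"))
scala> sales2.show(3)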
hive> create table spart (dt string, amt int) partitioned by (y int, m int, d int) row format delimited fields terminated by ',';
OK
hive> insert overwrite table spart partition (y,m,d) select dt, amt, year(dt),month(dt),day(dt) from sales;
hdfs dfs -ls /user/hive/warehouse/practice.db/spart
Found 4 items
drwxrwxrwx - cloudera supergroup 0 2018-11-19 01:30 /user/hive/warehouse/practice.db/spart/y=2011
drwxrwxrwx - cloudera supergroup 0 2018-11-19 01:30 /user/hive/warehouse/practice.db/spart/y=2012
drwxrwxrwx - cloudera supergroup 0 2018-11-19 01:30 /user/hive/warehouse/practice.db/spart/y=2013
drwxrwxrwx - cloudera supergroup 0 2018-11-19 01:30 /user/hive/warehouse/practice.db/spart/y=2014
// note: a y=2014 partition appears even though the data stops at 2013 -- the day/month swap produces dates like 2013-13-01, and year() rolls month 13 over into 2014
[cloudera@quickstart ~]$
[cloudera@quickstart ~]$ hdfs dfs -ls /user/hive/warehouse/practice.db/spart
Found 4 items
drwxrwxrwx - cloudera supergroup 0 2018-11-19 01:30 /user/hive/warehouse/practice.db/spart/y=2011
drwxrwxrwx - cloudera supergroup 0 2018-11-19 01:30 /user/hive/warehouse/practice.db/spart/y=2012
drwxrwxrwx - cloudera supergroup 0 2018-11-19 01:30 /user/hive/warehouse/practice.db/spart/y=2013
drwxrwxrwx - cloudera supergroup 0 2018-11-19 01:30 /user/hive/warehouse/practice.db/spart/y=2014
[cloudera@quickstart ~]$ hdfs dfs -ls /user/hive/warehouse/practice.db/spart/y=2011
Found 12 items
drwxrwxrwx - cloudera supergroup 0 2018-11-19 01:30 /user/hive/warehouse/practice.db/spart/y=2011/m=1
drwxrwxrwx - cloudera supergroup 0 2018-11-19 01:30 /user/hive/warehouse/practice.db/spart/y=2011/m=10
drwxrwxrwx - cloudera supergroup 0 2018-11-19 01:30 /user/hive/warehouse/practice.db/spart/y=2011/m=11
drwxrwxrwx - cloudera supergroup 0 2018-11-19 01:30 /user/hive/warehouse/practice.db/spart/y=2011/m=12
drwxrwxrwx - cloudera supergroup 0 2018-11-19 01:30 /user/hive/warehouse/practice.db/spart/y=2011/m=2
drwxrwxrwx - cloudera supergroup 0 2018-11-19 01:30 /user/hive/warehouse/practice.db/spart/y=2011/m=3
drwxrwxrwx - cloudera supergroup 0 2018-11-19 01:30 /user/hive/warehouse/practice.db/spart/y=2011/m=4
drwxrwxrwx - cloudera supergroup 0 2018-11-19 01:30 /user/hive/warehouse/practice.db/spart/y=2011/m=5
drwxrwxrwx - cloudera supergroup 0 2018-11-19 01:30 /user/hive/warehouse/practice.db/spart/y=2011/m=6
drwxrwxrwx - cloudera supergroup 0 2018-11-19 01:30 /user/hive/warehouse/practice.db/spart/y=2011/m=7
drwxrwxrwx - cloudera supergroup 0 2018-11-19 01:30 /user/hive/warehouse/practice.db/spart/y=2011/m=8
drwxrwxrwx - cloudera supergroup 0 2018-11-19 01:30 /user/hive/warehouse/practice.db/spart/y=2011/m=9
[cloudera@quickstart ~]$ hdfs dfs -ls /user/hive/warehouse/practice.db/spart/y=2011/m=1
Found 1 items
drwxrwxrwx - cloudera supergroup 0 2018-11-19 01:30 /user/hive/warehouse/practice.db/spart/y=2011/m=1/d=1
[cloudera@quickstart ~]$ hdfs dfs -ls /user/hive/warehouse/practice.db/spart/y=2011/m=11
Found 1 items
drwxrwxrwx - cloudera supergroup 0 2018-11-19 01:30 /user/hive/warehouse/practice.db/spart/y=2011/m=11/d=1
[cloudera@quickstart ~]$
Static:
partition (y=1999, m=12, d=30)
Dynamic:
partition (y,m,d)
select * from sales where dt = '2017-08-07'; -- scans all the records: bad performance
hive> select * from spart where m = 1 and d = 1;
OK
2011-01-01 45000 2011 1 1
2011-01-01 45000 2011 1 1
2011-01-01 45000 2011 1 1
2012-01-01 47000 2012 1 1
2011-13-01 45504 2012 1 1
2012-01-01 47000 2012 1 1
2011-13-01 45504 2012 1 1
2012-01-01 47000 2012 1 1
2011-13-01 45504 2012 1 1
2013-01-01 55000 2013 1 1
2012-13-01 47504 2013 1 1
2013-01-01 55000 2013 1 1
2012-13-01 47504 2013 1 1
2013-01-01 55000 2013 1 1
2012-13-01 47504 2013 1 1
2013-13-01 55504 2014 1 1
2013-13-01 55504 2014 1 1
2013-13-01 55504 2014 1 1
Time taken: 0.051 seconds, Fetched: 18 row(s)
hive> select * from spart where y = 2011 and m = 1 and d = 1;
OK
2011-01-01 45000 2011 1 1
2011-01-01 45000 2011 1 1
2011-01-01 45000 2011 1 1
Time taken: 0.052 seconds, Fetched: 3 row(s)
hive>
xml data processing using Hive:
sample xml data:
<rec>
 <name>Ravi</name>
 <age>25</age>
</rec>
<rec>
 <name>Rani</name>
 <sex>f</sex>
</rec>
if the above file is loaded into hive table, there will be 8 records ( rows)
we need to transform each vertical xml record into a horizontal xml record.
[using MapReduce / Spark ]
After transformation:
---------------------
<rec><name>Ravi</name><age>25</age></rec>
<rec><name>Rani</name><sex>f</sex></rec>
$ cat > myxml.xml
<rec><name>Ravi</name><age>25</age></rec>
<rec><name>Rani</name><sex>f</sex></rec>
[cloudera@quickstart ~]$ cat myxml.xml
<rec><name>Ravi</name><age>25</age></rec>
<rec><name>Rani</name><sex>f</sex></rec>
Now records are converted from vertical into horizontal
xml is semi-structured
hive> create database xmls;
OK
Time taken: 0.211 seconds
hive> use xmls;
OK
Time taken: 0.008 seconds
hive> create table raw (line string);
OK
Time taken: 0.061 seconds
hive> load data local inpath 'myxml.xml' into table raw;
Loading data to table xmls.raw
Table xmls.raw stats: [numFiles=1, totalSize=83]
OK
Time taken: 0.123 seconds
hive> select * from raw;
OK
<rec><name>Ravi</name><age>25</age></rec>
<rec><name>Rani</name><sex>f</sex></rec>
Time taken: 0.028 seconds, Fetched: 2 row(s)
hive> select xpath_string(line,'rec/name') from raw;
OK
Ravi
Rani
Time taken: 0.035 seconds, Fetched: 2 row(s)
hive> select xpath_int(line,'rec/age') from raw;
OK
25
0
Time taken: 0.035 seconds, Fetched: 2 row(s)
if a string field is missing, you get an empty string.
if a numeric field is missing, you get 0.
hive> create table info(name string, age int, sex string);
hive> insert into table info select xpath_string(line,'rec/name'), xpath_int(line,'rec/age'), xpath_string(line,'rec/sex') from raw;
hive> select * from info;
OK
Ravi 25
Rani 0 f
Time taken: 0.035 seconds, Fetched: 2 row(s)
cat > xml2
<rec><name><fname>Siri</fname><lname>Veni</lname></name><age>23</age><contact><email><personal>siri@gmail.com</personal><official>siri@ibm.com</official></email><phone><mobile>9886177375</mobile><office>9886177375</office><residence>9886177375</residence></phone></contact><city>Hyderabad</city></rec>
hive> create table xraw (line string);
OK
Time taken: 0.03 seconds
hive> load data local inpath 'xml2' into table xraw;
Loading data to table xmls.xraw
Table xmls.xraw stats: [numFiles=1, totalSize=302]
OK
Time taken: 0.126 seconds
hive> select * from xraw;
OK
<rec><name><fname>Siri</fname><lname>Veni</lname></name><age>23</age><contact><email><personal>siri@gmail.com</personal><official>siri@ibm.com</official></email><phone><mobile>9886177375</mobile><office>9886177375</office><residence>9886177375</residence></phone></contact><city>Hyderabad</city></rec>
Time taken: 0.031 seconds, Fetched: 1 row(s)
hive> select xpath_string(line,'rec/name/fname'), xpath_string(line,'rec/name/lname'),xpath_int(line,'rec/age'), xpath_string(line,'rec/contact/email/personal'),xpath_string(line,'rec/contact/email/official'),xpath_string(line,'rec/contact/phone/mobile'),xpath_string(line,'rec/contact/phone/office'),xpath_string(line,'rec/contact/phone/residence'),xpath_string(line,'rec/city') from xraw;
OK
Siri Veni 23 siri@gmail.com siri@ibm.com 9886177375 9886177375 9886177375 Hyderabad
Time taken: 0.058 seconds, Fetched: 1 row(s)
hive> create table xinfo(fname string, lname string, age int, personal_email string, official_email string, mobile string, office_phone string, residence_phone string, city string);
OK
Time taken: 0.033 seconds
hive>
hive> insert into table xinfo select xpath_string(line,'rec/name/fname'), xpath_string(line,'rec/name/lname'),xpath_int(line,'rec/age'), xpath_string(line,'rec/contact/email/personal'),xpath_string(line,'rec/contact/email/official'),xpath_string(line,'rec/contact/phone/mobile'),xpath_string(line,'rec/contact/phone/office'),xpath_string(line,'rec/contact/phone/residence'),xpath_string(line,'rec/city') from xraw;
hive> select * from xraw;
OK
<rec><name><fname>Siri</fname><lname>Veni</lname></name><age>23</age><contact><email><personal>siri@gmail.com</personal><official>siri@ibm.com</official></email><phone><mobile>9886177375</mobile><office>9886177375</office><residence>9886177375</residence></phone></contact><city>Hyderabad</city></rec>
Time taken: 0.602 seconds, Fetched: 1 row(s)
hive> select * from xinfo;
OK
Siri Veni 23 siri@gmail.com siri@ibm.com 9886177375 9886177375 9886177375 Hyderabad
Time taken: 0.038 seconds, Fetched: 1 row(s)
hive>
cat > xml3.xml
<rec><name>Ravi</name><qual>Btech</qual><qual>MTech</qual></rec>
<rec><name>Giri</name><qual>Btech</qual><qual>MTech</qual><qual>PhD</qual></rec>
<rec><name>Veni</name><qual>Bsc</qual><qual>Msc</qual><qual>MTech</qual></rec>
hive> use xmls;
OK
Time taken: 0.011 seconds
hive> create table xmlraw(line string);
OK
Time taken: 0.121 seconds
hive> load data local inpath 'xml3.xml' into table xmlraw;
Loading data to table xmls.xmlraw
Table xmls.xmlraw stats: [numFiles=1, totalSize=226]
OK
Time taken: 0.432 seconds
hive> select * from xmlraw;
OK
<rec><name>Ravi</name><qual>Btech</qual><qual>MTech</qual></rec>
<rec><name>Giri</name><qual>Btech</qual><qual>MTech</qual><qual>PhD</qual></rec>
<rec><name>Veni</name><qual>Bsc</qual><qual>Msc</qual><qual>MTech</qual></rec>
Time taken: 0.061 seconds, Fetched: 3 row(s)
hive>
hive> select xpath_string(line,'rec/name') from xmlraw;
OK
Ravi
Giri
Veni
Time taken: 0.085 seconds, Fetched: 3 row(s)
// Here it fetches only the 1st qualification.
hive> select xpath_string(line,'rec/qual') from xmlraw;
OK
Btech
Btech
Bsc
Time taken: 0.055 seconds, Fetched: 3 row(s)
hive> select xpath(line,'rec/qual/text()') from xmlraw;
OK
["Btech","MTech"]
["Btech","MTech","PhD"]
["Bsc","Msc","MTech"]
Time taken: 0.044 seconds, Fetched: 3 row(s)
hive> create table raw2(name string, qual array<string>);
hive> insert overwrite table raw2 select xpath_string(line,'rec/name'),xpath(line,'rec/qual/text()') from xmlraw;
hive> select * from raw2;
OK
Ravi ["Btech","MTech"]
Giri ["Btech","MTech","PhD"]
Veni ["Bsc","Msc","MTech"]
hive> select name,size(qual) from raw2;
OK
Ravi 2
Giri 3
Veni 3
Time taken: 0.039 seconds, Fetched: 3 row(s)
hive> select * from raw2 where array_contains(qual,'Bsc');
OK
Veni ["Bsc","Msc","MTech"]
Time taken: 0.047 seconds, Fetched: 1 row(s)
hive> select * from raw2;
OK
Ravi ["Btech","MTech"]
Giri ["Btech","MTech","PhD"]
Veni ["Bsc","Msc","MTech"]
Time taken: 0.067 seconds, Fetched: 3 row(s)
hive> select * from raw2 where array_contains(qual,'Btech');
OK
Ravi ["Btech","MTech"]
Giri ["Btech","MTech","PhD"]
Time taken: 0.048 seconds, Fetched: 2 row(s)
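Spark ships the same collection helpers in org.apache.spark.sql.functions. A sketch, assuming the name/qual pairs are in a DataFrame quals (hypothetical name) with qual as an array column:
scala> import org.apache.spark.sql.functions._
scala> quals.select($"name", size($"qual") as "n").show                 // like size(qual)
scala> quals.filter(array_contains($"qual", "Btech")).show              // like array_contains(qual,'Btech')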
hive> select explode(qual) as q from raw2;
OK
Btech
MTech
Btech
MTech
PhD
Bsc
Msc
MTech
Time taken: 0.04 seconds, Fetched: 8 row(s)
hive> select name, myq from raw2 lateral view explode(qual) q as myq;
OK
Ravi Btech
Ravi MTech
Giri Btech
Giri MTech
Giri PhD
Veni Bsc
Veni Msc
Veni MTech
Time taken: 0.045 seconds, Fetched: 8 row(s)
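The lateral view explode pattern maps to explode() in Spark; selecting another column alongside it keeps the name, just as the lateral view does. Sketch, same hypothetical quals DataFrame:
scala> import org.apache.spark.sql.functions._
scala> quals.select($"name", explode($"qual") as "myq").show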
cat xml4.xml
<tr><cid>101</cid><pr>1000</pr><pr>2000</pr><pr>4000</pr></tr>
<tr><cid>102</cid><pr>3000</pr><pr>5000</pr></tr>
<tr><cid>101</cid><pr>5000</pr></tr>
hive> create table sraw(line string);
OK
Time taken: 0.052 seconds
hive> load data local inpath 'xml4.xml' into table sraw;
Loading data to table xmls.sraw
Table xmls.sraw stats: [numFiles=1, totalSize=150]
OK
Time taken: 0.106 seconds
hive> select * from sraw;
OK
<tr><cid>101</cid><pr>1000</pr><pr>2000</pr><pr>4000</pr></tr>
<tr><cid>102</cid><pr>3000</pr><pr>5000</pr></tr>
<tr><cid>101</cid><pr>5000</pr></tr>
Time taken: 0.041 seconds, Fetched: 3 row(s)
hive> select xpath_int(line,'tr/cid') from sraw;
OK
101
102
101
Time taken: 0.044 seconds, Fetched: 3 row(s)
hive> select xpath(line,'tr/pr/text()') from sraw;
OK
["1000","2000","4000"]
["3000","5000"]
["5000"]
Time taken: 0.045 seconds, Fetched: 3 row(s)
hive> create table sraw2(cid int, pr array<string>);
OK
Time taken: 0.035 seconds
hive> insert into table sraw2 select xpath_int(line,'tr/cid'),xpath(line,'tr/pr/text()') from sraw;
hive> select * from sraw2;
OK
101 ["1000","2000","4000"]
102 ["3000","5000"]
101 ["5000"]
Time taken: 0.04 seconds, Fetched: 3 row(s)
hive> select * from sraw;
OK
<tr><cid>101</cid><pr>1000</pr><pr>2000</pr><pr>4000</pr></tr>
<tr><cid>102</cid><pr>3000</pr><pr>5000</pr></tr>
<tr><cid>101</cid><pr>5000</pr></tr>
Time taken: 0.041 seconds, Fetched: 3 row(s)
hive> select * from sraw2;
OK
101 ["1000","2000","4000"]
102 ["3000","5000"]
101 ["5000"]
Time taken: 0.033 seconds, Fetched: 3 row(s)
hive> select explode(pr) from sraw2;
OK
1000
2000
4000
3000
5000
5000
Time taken: 0.041 seconds, Fetched: 6 row(s)
hive> create table sales(cid int, pr int) row format delimited fields terminated by ',';
OK
Time taken: 0.052 seconds
hive> insert into table sales select cid, mypr from sraw2 lateral view explode(pr) p as mypr;
hive> select * from sales;
OK
101 1000
101 2000
101 4000
102 3000
102 5000
101 5000
Time taken: 0.037 seconds, Fetched: 6 row(s)
hive> create table results (cid int, tot int);
OK
Time taken: 0.039 seconds
hive> insert into table results select cid, sum(pr) from sales group by cid;
hive> select * from results;
OK
101 12000
102 8000
Time taken: 0.045 seconds, Fetched: 2 row(s)
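The whole explode-then-aggregate pipeline is a few chained calls in Spark. A sketch, assuming sraw2's rows sit in a DataFrame sr2 (hypothetical name) with cid int and pr array<string>:
scala> import org.apache.spark.sql.functions._
scala> // flatten the price array, cast the string prices, and sum per customer
scala> sr2.select($"cid", explode($"pr") as "pr").groupBy($"cid").agg(sum($"pr".cast("int")) as "tot").show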
cat xml5.xml
<tr><cid>101</cid><pr id="p1">1000</pr><pr>2000</pr><pr id="p7">4000</pr></tr>
<tr><cid>102</cid><pr id="p7">3000</pr><pr id="p2">5000</pr></tr>
hive> create table xxraw(line string);
OK
Time taken: 0.047 seconds
hive> load data local inpath 'xml5.xml' into table xxraw;
Loading data to table xmls.xxraw
Table xmls.xxraw stats: [numFiles=1, totalSize=145]
OK
Time taken: 0.115 seconds
hive> select * from xxraw;
OK
<tr><cid>101</cid><pr id="p1">1000</pr><pr>2000</pr><pr id="p7">4000</pr></tr>
<tr><cid>102</cid><pr id="p7">3000</pr><pr id="p2">5000</pr></tr>
Time taken: 0.035 seconds, Fetched: 2 row(s)
hive> select xpath_int(line,'tr/cid') from xxraw;
OK
101
102
Time taken: 0.044 seconds, Fetched: 2 row(s)
hive> select xpath(line,'tr/cid/text()') from xxraw;
OK
["101"]
["102"]
Time taken: 0.056 seconds, Fetched: 2 row(s)
hive> select xpath(line,'tr/pr/text()') from xxraw;
OK
["1000","2000","4000"]
["3000","5000"]
Time taken: 0.046 seconds, Fetched: 2 row(s)
hive> select xpath(line,'tr/pr/@id') from xxraw;
OK
["p1","p7"]
["p7","p2"]
Time taken: 0.04 seconds, Fetched: 2 row(s)
hive> select xpath_int(line,'tr/cid'), xpath(line,'tr/pr/@id'),xpath(line,'tr/pr/text()') from xxraw;
OK
101 ["p1","p7"] ["1000","2000","4000"]
102 ["p7","p2"] ["3000","5000"]
Time taken: 0.087 seconds, Fetched: 2 row(s)
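Spark SQL ships the same xpath family of functions, so the raw lines can be queried the same way via selectExpr. Sketch, assuming the lines are in a DataFrame xx (hypothetical name):
scala> xx.selectExpr("xpath_int(line,'tr/cid') as cid", "xpath(line,'tr/pr/@id') as ids", "xpath(line,'tr/pr/text()') as prices").show(false)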
JSON integration with Hive:
---------------------------
cat json1.json
{"name":"Ravi","age":25}
{"name":"Rani","city":"Hyderabad"}
{"name":"Mani","age":24,"city":"Delhi"}
hive> create database jsons;
OK
Time taken: 0.068 seconds
hive> use jsons;
OK
Time taken: 0.007 seconds
hive> create table raw(line string);
OK
Time taken: 0.07 seconds
hive> load data local inpath 'json1.json' into table raw;
Loading data to table jsons.raw
Table jsons.raw stats: [numFiles=1, totalSize=100]
OK
Time taken: 0.121 seconds
hive> select * from raw;
OK
{"name":"Ravi","age":25}
{"name":"Rani","city":"Hyderabad"}
{"name":"Mani","age":24,"city":"Delhi"}
Time taken: 0.041 seconds, Fetched: 3 row(s)
hive> select get_json_object(line,'$.name') from raw;
OK
Ravi
Rani
Mani
Time taken: 0.052 seconds, Fetched: 3 row(s)
hive> select get_json_object(line,'$.age') from raw;
OK
25
NULL
24
Time taken: 0.052 seconds, Fetched: 3 row(s)
hive> select get_json_object(line,'$.city') from raw;
OK
NULL
Hyderabad
Delhi
Time taken: 0.045 seconds, Fetched: 3 row(s)
hive> select get_json_object(line,'$.name'),get_json_object(line,'$.age'),get_json_object(line,'$.city') from raw;
OK
Ravi 25 NULL
Rani NULL Hyderabad
Mani 24 Delhi
Time taken: 0.036 seconds, Fetched: 3 row(s)
hive> select x.* from raw lateral view json_tuple(line,'name','age','city') x as n,a,c;
OK
Ravi 25 NULL
Rani NULL Hyderabad
Mani 24 Delhi
hive> create table info (name string, age int, city string);
OK
Time taken: 0.026 seconds
hive> insert overwrite table info select x.* from raw lateral view json_tuple(line,'name','age','city') x as n,a,c;
hive> select * from info;
OK
Ravi 25 NULL
Rani NULL Hyderabad
Mani 24 Delhi
Time taken: 0.03 seconds, Fetched: 3 row(s)
get_json_object() -- udf
json_tuple() -- udtf
if a field is missing in the json record, both return NULL
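Spark can skip the string-parsing stage entirely: spark.read.json infers the schema, and missing fields come back as null, matching the behaviour above. get_json_object is also available in org.apache.spark.sql.functions if you do start from raw strings. A sketch (the local path is an assumption):
scala> val people = spark.read.json("file:///home/cloudera/json1.json")
scala> people.show   // columns age, city, name; absent fields are null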
cat json2.json
{"name":"Ravi","age":25,"wife":{"name":"Rani","age":24,"city":"Hyd"},"city":"Delhi"}
{"name":"Kiran","age":30,"wife":{"name":"Veni","qual":"BTech","city":"Hyd"},"city":"Hyd"}
hive> create table jraw(line string);
OK
Time taken: 0.071 seconds
hive> load data local inpath 'json2.json' into table jraw;
Loading data to table jsons.jraw
Table jsons.jraw stats: [numFiles=1, totalSize=175]
OK
Time taken: 0.117 seconds
hive> select * from jraw;
OK
{"name":"Ravi","age":25,"wife":{"name":"Rani","age":24,"city":"Hyd"},"city":"Delhi"}
{"name":"Kiran","age":30,"wife":{"name":"Veni","qual":"BTech","city":"Hyd"},"city":"Hyd"}
Time taken: 0.031 seconds, Fetched: 2 row(s)
hive> create table raw2(name string, age int, wife string, city string);
OK
Time taken: 0.026 seconds
hive> insert into table raw2 select x.* from jraw lateral view json_tuple(line,'name','age','wife','city') x as n,a,w,c;
hive> select * from raw2;
OK
Ravi 25 {"name":"Rani","age":24,"city":"Hyd"} Delhi
Kiran 30 {"name":"Veni","qual":"BTech","city":"Hyd"} Hyd
Time taken: 0.03 seconds, Fetched: 2 row(s)
hive> select * from jraw;
OK
{"name":"Ravi","age":25,"wife":{"name":"Rani","age":24,"city":"Hyd"},"city":"Delhi"}
{"name":"Kiran","age":30,"wife":{"name":"Veni","qual":"BTech","city":"Hyd"},"city":"Hyd"}
Time taken: 0.029 seconds, Fetched: 2 row(s)
hive> select * from raw2;
OK
Ravi 25 {"name":"Rani","age":24,"city":"Hyd"} Delhi
Kiran 30 {"name":"Veni","qual":"BTech","city":"Hyd"} Hyd
Time taken: 0.027 seconds, Fetched: 2 row(s)
hive> create table jinfo(hname string, hage int, hcity string, wname string, wage int, wcity string, wqual string) row format delimited fields terminated by ',';
OK
Time taken: 0.047 seconds
hive> insert into table jinfo select name, age,city,get_json_object(wife,'$.name'),get_json_object(wife,'$.age'),get_json_object(wife,'$.city'),get_json_object(wife,'$.qual') from raw2;
hive> select * from jinfo;
OK
Ravi 25 Delhi Rani 24 Hyd NULL
Kiran 30 Hyd Veni NULL Hyd BTech
Time taken: 0.027 seconds, Fetched: 2 row(s)
hive> select * from jraw;
OK
{"name":"Ravi","age":25,"wife":{"name":"Rani","age":24,"city":"Hyd"},"city":"Delhi"}
{"name":"Kiran","age":30,"wife":{"name":"Veni","qual":"BTech","city":"Hyd"},"city":"Hyd"}
Time taken: 0.035 seconds, Fetched: 2 row(s)
hive> select * from raw2;
OK
Ravi 25 {"name":"Rani","age":24,"city":"Hyd"} Delhi
Kiran 30 {"name":"Veni","qual":"BTech","city":"Hyd"} Hyd
Time taken: 0.043 seconds, Fetched: 2 row(s)
hive> select * from jinfo;
OK
Ravi 25 Delhi Rani 24 Hyd NULL
Kiran 30 Hyd Veni NULL Hyd BTech
Time taken: 0.027 seconds, Fetched: 2 row(s)
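With spark.read.json the nested wife object becomes a struct, so the flattening is plain dot paths instead of get_json_object round trips. A sketch (path is an assumption):
scala> val j = spark.read.json("file:///home/cloudera/json2.json")
scala> j.select($"name" as "hname", $"age" as "hage", $"city" as "hcity", $"wife.name" as "wname", $"wife.age" as "wage", $"wife.city" as "wcity", $"wife.qual" as "wqual").show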
cat json3.json
{"name":"Ravi","qual":["btech","mtech","phd"]}
{"name":"Mani","qual":["bsc","mba"]}
{"name":"Rani","qual":["bsc","msc","mtech"]}
hive> create table jsraw(line string);
OK
Time taken: 0.053 seconds
hive> load data local inpath 'json3.json' into table jsraw;
Loading data to table jsons.jsraw
Table jsons.jsraw stats: [numFiles=1, totalSize=129]
OK
Time taken: 0.108 seconds
hive> select * from jsraw;
OK
{"name":"Ravi","qual":["btech","mtech","phd"]}
{"name":"Mani","qual":["bsc","mba"]}
{"name":"Rani","qual":["bsc","msc","mtech"]}
Time taken: 0.032 seconds, Fetched: 3 row(s)
hive> create table jsraw2(name string, qual string);
hive> insert into table jsraw2 select x.* from jsraw lateral view json_tuple(line,'name','qual') x as n,q;
hive> select * from jsraw2;
OK
Ravi ["btech","mtech","phd"]
Mani ["bsc","mba"]
Rani ["bsc","msc","mtech"]
Time taken: 0.027 seconds, Fetched: 3 row(s)
hive> select split(qual,',') from jsraw2;
OK
["[\"btech\"","\"mtech\"","\"phd\"]"]
["[\"bsc\"","\"mba\"]"]
["[\"bsc\"","\"msc\"","\"mtech\"]"]
Time taken: 0.035 seconds, Fetched: 3 row(s)
hive> create table raw3 (name string, qual array<string>);
OK
Time taken: 0.03 seconds
hive> insert into table raw3 select name, split(qual,',') from jsraw2;
hive> select * from raw3;
OK
Ravi ["[\"btech\"","\"mtech\"","\"phd\"]"]
Mani ["[\"bsc\"","\"mba\"]"]
Rani ["[\"bsc\"","\"msc\"","\"mtech\"]"]
Time taken: 0.033 seconds, Fetched: 3 row(s)
hive> select explode(qual) from raw3;
OK
["btech"
"mtech"
"phd"]
["bsc"
"mba"]
["bsc"
"msc"
"mtech"]
Time taken: 0.05 seconds, Fetched: 8 row(s)
hive> create table raw4(name string, qual string);
OK
Time taken: 0.03 seconds
hive> insert overwrite table raw4 select name, myq from raw3 lateral view explode(qual) q as myq;
hive> select * from raw4;
OK
Ravi ["btech"
Ravi "mtech"
Ravi "phd"]
Mani ["bsc"
Mani "mba"]
Rani ["bsc"
Rani "msc"
Rani "mtech"]
Time taken: 0.029 seconds, Fetched: 8 row(s)
hive> select split(qual,'"') from raw4;
OK
["[","btech",""]
["","mtech",""]
["","phd","]"]
["[","bsc",""]
["","mba","]"]
["[","bsc",""]
["","msc",""]
["","mtech","]"]
Time taken: 0.032 seconds, Fetched: 8 row(s)
hive> select split(qual,'"')[1] from raw4;
OK
btech
mtech
phd
bsc
mba
bsc
msc
mtech
Time taken: 0.039 seconds, Fetched: 8 row(s)
hive> create table jsinfo like raw4;
OK
Time taken: 0.054 seconds
hive> insert into table jsinfo select name,split(qual,'"')[1] from raw4;
hive> select * from jsinfo;
OK
Ravi btech
Ravi mtech
Ravi phd
Mani bsc
Mani mba
Rani bsc
Rani msc
Rani mtech
Time taken: 0.036 seconds, Fetched: 8 row(s)
hive> select * from jsraw;
OK
{"name":"Ravi","qual":["btech","mtech","phd"]}
{"name":"Mani","qual":["bsc","mba"]}
{"name":"Rani","qual":["bsc","msc","mtech"]}
Time taken: 0.069 seconds, Fetched: 3 row(s)
hive> select * from jsraw2;
OK
Ravi ["btech","mtech","phd"]
Mani ["bsc","mba"]
Rani ["bsc","msc","mtech"]
Time taken: 0.031 seconds, Fetched: 3 row(s)
hive> select * from raw3;
OK
Ravi ["[\"btech\"","\"mtech\"","\"phd\"]"]
Mani ["[\"bsc\"","\"mba\"]"]
Rani ["[\"bsc\"","\"msc\"","\"mtech\"]"]
Time taken: 0.031 seconds, Fetched: 3 row(s)
hive> select * from raw4;
OK
Ravi ["btech"
Ravi "mtech"
Ravi "phd"]
Mani ["bsc"
Mani "mba"]
Rani ["bsc"
Rani "msc"
Rani "mtech"]
Time taken: 0.032 seconds, Fetched: 8 row(s)
hive> select * from jsinfo;
OK
Ravi btech
Ravi mtech
Ravi phd
Mani bsc
Mani mba
Rani bsc
Rani msc
Rani mtech
Time taken: 0.05 seconds, Fetched: 8 row(s)
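The quote-splitting gymnastics above can be avoided in Spark (2.4+): from_json parses the stringified array back into a real array<string>. A sketch, assuming jsraw2's rows are in a DataFrame js2 (hypothetical name) with columns name and qual (the JSON-array string):
scala> import org.apache.spark.sql.functions._
scala> import org.apache.spark.sql.types._
scala> // parse the JSON array, then flatten it to one qualification per row
scala> js2.select($"name", explode(from_json($"qual", ArrayType(StringType))) as "qual").show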
URL parser:
------------
[cloudera@quickstart ~]$ cat urls.txt
http://training.com/bigdata/hadoop?id=101&name=Giri&age=23&city=hyd
http://training.com/bigdata/spark?id=101&name=Diri&sex=f&city=del
http://training.com/bigdata/spark?id=102&name=Xiri&age=33&sex=f
http://training.com/bigdata/spark?id=104&name=Miri&age=23&sex=m
hive> create database urls;
OK
Time taken: 0.044 seconds
hive> use urls;
OK
Time taken: 0.008 seconds
hive> create table raw(line string);
OK
Time taken: 0.034 seconds
hive> load data local inpath 'urls.txt' into table raw;
Loading data to table urls.raw
Table urls.raw stats: [numFiles=1, totalSize=262]
OK
Time taken: 0.101 seconds
hive> select * from raw;
OK
http://training.com/bigdata/hadoop?id=101&name=Giri&age=23&city=hyd
http://training.com/bigdata/spark?id=101&name=Diri&sex=f&city=del
http://training.com/bigdata/spark?id=102&name=Xiri&age=33&sex=f
http://training.com/bigdata/spark?id=104&name=Miri&age=23&sex=m
Time taken: 0.026 seconds, Fetched: 4 row(s)
hive> select parse_url(line,'HOST') from raw;
OK
training.com
training.com
training.com
training.com
Time taken: 0.032 seconds, Fetched: 4 row(s)
hive> select parse_url(line,'PATH') from raw;
OK
/bigdata/hadoop
/bigdata/spark
/bigdata/spark
/bigdata/spark
Time taken: 0.032 seconds, Fetched: 4 row(s)
hive> select parse_url(line,'QUERY') from raw;
OK
id=101&name=Giri&age=23&city=hyd
id=101&name=Diri&sex=f&city=del
id=102&name=Xiri&age=33&sex=f
id=104&name=Miri&age=23&sex=m
Time taken: 0.035 seconds, Fetched: 4 row(s)
hive> select parse_url(line,'HOST'), parse_url(line,'PATH'), parse_url(line,'QUERY') from raw;
OK
training.com /bigdata/hadoop id=101&name=Giri&age=23&city=hyd
training.com /bigdata/spark id=101&name=Diri&sex=f&city=del
training.com /bigdata/spark id=102&name=Xiri&age=33&sex=f
training.com /bigdata/spark id=104&name=Miri&age=23&sex=m
Time taken: 0.062 seconds, Fetched: 4 row(s)
// using lateral view
hive> select x.* from raw lateral view parse_url_tuple(line,'HOST','PATH','QUERY') x as h,p,q;
OK
training.com /bigdata/hadoop id=101&name=Giri&age=23&city=hyd
training.com /bigdata/spark id=101&name=Diri&sex=f&city=del
training.com /bigdata/spark id=102&name=Xiri&age=33&sex=f
training.com /bigdata/spark id=104&name=Miri&age=23&sex=m
Time taken: 0.048 seconds, Fetched: 4 row(s)
hive> create table raw2(host string, path string, query string);
hive> insert into table raw2 select x.* from raw lateral view parse_url_tuple(line,'HOST','PATH','QUERY') x as h,p,q;
hive> select * from raw2;
OK
training.com /bigdata/hadoop id=101&name=Giri&age=23&city=hyd
training.com /bigdata/spark id=101&name=Diri&sex=f&city=del
training.com /bigdata/spark id=102&name=Xiri&age=33&sex=f
training.com /bigdata/spark id=104&name=Miri&age=23&sex=m
Time taken: 0.029 seconds, Fetched: 4 row(s)
hive> select host,split(path,'/'),str_to_map(query,'&','=') from raw2;
OK
training.com ["","bigdata","hadoop"] {"id":"101","name":"Giri","age":"23","city":"hyd"}
training.com ["","bigdata","spark"] {"id":"101","name":"Diri","sex":"f","city":"del"}
training.com ["","bigdata","spark"] {"id":"102","name":"Xiri","age":"33","sex":"f"}
training.com ["","bigdata","spark"] {"id":"104","name":"Miri","age":"23","sex":"m"}
Time taken: 0.031 seconds, Fetched: 4 row(s)
hive> create table raw3(host string, path array<string>, qmap map<string,string>);
hive> insert into table raw3 select host,split(path,'/'),str_to_map(query,'&','=') from raw2;
hive> select * from raw3;
OK
training.com ["","bigdata","hadoop"] {"id":"101","name":"Giri","age":"23","city":"hyd"}
training.com ["","bigdata","spark"] {"id":"101","name":"Diri","sex":"f","city":"del"}
training.com ["","bigdata","spark"] {"id":"102","name":"Xiri","age":"33","sex":"f"}
training.com ["","bigdata","spark"] {"id":"104","name":"Miri","age":"23","sex":"m"}
Time taken: 0.029 seconds, Fetched: 4 row(s)
hive> create table info (host string, category string, course string, id int, name string, age int, sex string, city string);
OK
Time taken: 0.05 seconds
hive> describe info;
OK
host string
category string
course string
id int
name string
age int
sex string
city string
Time taken: 0.049 seconds, Fetched: 8 row(s)
hive> insert into table info select host, path[1],path[2],qmap['id'],qmap['name'],qmap['age'],qmap['sex'],qmap['city'] from raw3;
hive> select * from info;
OK
training.com bigdata hadoop 101 Giri 23 NULL hyd
training.com bigdata spark 101 Diri NULL f del
training.com bigdata spark 102 Xiri 33 f NULL
training.com bigdata spark 104 Miri 23 m NULL
Time taken: 0.032 seconds, Fetched: 4 row(s)
hive> select sex,count(*) as cnt from info group by sex order by cnt desc ;
f 2
m 1
NULL 1
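parse_url and str_to_map are built into Spark SQL too, so the same flattening works on a DataFrame of raw lines. A sketch (urlsDF is a hypothetical DataFrame with one string column, line):
scala> val parsed = urlsDF.selectExpr("parse_url(line,'HOST') as host", "split(parse_url(line,'PATH'),'/') as path", "str_to_map(parse_url(line,'QUERY'),'&','=') as qmap")
scala> parsed.selectExpr("host", "path[1] as category", "path[2] as course", "qmap['id'] as id", "qmap['sex'] as sex").show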
hive> create database mydb;
OK
Time taken: 0.068 seconds
hive> use mydb;
OK
Time taken: 0.007 seconds
hive> create table mytab(a int, b int, c int);
OK
Time taken: 0.026 seconds
hive> describe mytab;
OK
a int
b int
c int
Time taken: 0.073 seconds, Fetched: 3 row(s)
hive> describe formatted mytab;
OK
# col_name data_type comment
a int
b int
c int
# Detailed Table Information
Database: mydb
Owner: cloudera
CreateTime: Tue Nov 20 21:57:37 PST 2018
LastAccessTime: UNKNOWN
Protect Mode: None
Retention: 0
Location: hdfs://quickstart.cloudera:8020/user/hive/warehouse/mydb.db/mytab
Table Type: MANAGED_TABLE
Table Parameters:
transient_lastDdlTime 1542779857
# Storage Information
SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
InputFormat: org.apache.hadoop.mapred.TextInputFormat
OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
Compressed: No
Num Buckets: -1
Bucket Columns: []
Sort Columns: []
Storage Desc Params:
serialization.format 1
Time taken: 0.133 seconds, Fetched: 28 row(s)
hive> describe extended mytab;
OK
a int
b int
c int
Detailed Table Information Table(tableName:mytab, dbName:mydb, owner:cloudera, createTime:1542779857, lastAccessTime:0, retention:0, sd:StorageDescriptor(cols:[FieldSchema(name:a, type:int, comment:null), FieldSchema(name:b, type:int, comment:null), FieldSchema(name:c, type:int, comment:null)], location:hdfs://quickstart.cloudera:8020/user/hive/warehouse/mydb.db/mytab, inputFormat:org.apache.hadoop.mapred.TextInputFormat, outputFormat:org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat, compressed:false, numBuckets:-1, serdeInfo:SerDeInfo(name:null, serializationLib:org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, parameters:{serialization.format=1}), bucketCols:[], sortCols:[], parameters:{}, skewedInfo:SkewedInfo(skewedColNames:[], skewedColValues:[], skewedColValueLocationMaps:{}), storedAsSubDirectories:false), partitionKeys:[], parameters:{transient_lastDdlTime=1542779857}, viewOriginalText:null, viewExpandedText:null, tableType:MANAGED_TABLE)
Time taken: 0.366 seconds, Fetched: 5 row(s)
hdfs dfs -ls /user/hive/warehouse/mydb.db/
Found 1 items
drwxrwxrwx - cloudera supergroup 0 2018-11-20 21:57 /user/hive/warehouse/mydb.db/mytab
gedit file1.txt:
---------------
1,2,3
100,200,300
10,20,30
1000,2000,300
111,222,333
1111,2222,3333
1,2,3
11,22,33
hive> load data local inpath 'file1.txt' into table mytab;
// by default a hive table expects '\001' (Ctrl+A) as the field delimiter; if we load CSV into it, the result is:
hive> select * from mytab;
OK
NULL NULL NULL
NULL NULL NULL
NULL NULL NULL
NULL NULL NULL
NULL NULL NULL
NULL NULL NULL
NULL NULL NULL
NULL NULL NULL
Time taken: 0.416 seconds, Fetched: 8 row(s)
hive> drop table mytab;
hive> create table mytab(a int, b int, c int) row format delimited fields terminated by ',';
OK
Time taken: 0.157 seconds
hive> load data local inpath 'file1.txt' into table mytab;
Loading data to table mydb.mytab
Table mydb.mytab stats: [numFiles=1, totalSize=83]
OK
Time taken: 0.138 seconds
hive> select * from mytab;
OK
1 2 3
100 200 300
10 20 30
1000 2000 300
111 222 333
1111 2222 3333
1 2 3
11 22 33
Time taken: 0.059 seconds, Fetched: 8 row(s)
cat > file2.txt
2,3,4
20,30,40
200,300,400
2000,3000,4000
hive> use mydb;
OK
Time taken: 0.076 seconds
hive> load data local inpath 'file2.txt' into table mytab;
Loading data to table mydb.mytab
Table mydb.mytab stats: [numFiles=2, totalSize=125]
OK
Time taken: 0.556 seconds
hive> select * from mytab;
OK
1 2 3
100 200 300
10 20 30
1000 2000 300
111 222 333
1111 2222 3333
1 2 3
11 22 33
2 3 4
20 30 40
200 300 400
2000 3000 4000
Time taken: 0.498 seconds, Fetched: 12 row(s)
// homogeneous: every element of the collection has the same meaning
[cloudera@quickstart ~]$ cat > file4.txt
Ravi,25,BTech#MTech
Rani,26,Msc#MBA#PhD
hive> create table tabx(name string, age int, qual array<string>) row format delimited fields terminated by ',' collection items terminated by '#';
OK
Time taken: 0.089 seconds
hive> load data local inpath 'file4.txt' into table tabx;
Loading data to table mydb.tabx
Table mydb.tabx stats: [numFiles=1, totalSize=40]
OK
Time taken: 0.161 seconds
hive> select * from tabx;
OK
Ravi 25 ["BTech","MTech"]
Rani 26 ["Msc","MBA","PhD"]
Time taken: 0.068 seconds, Fetched: 2 row(s)
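In Spark the same file can be read as plain CSV and the '#'-separated field turned into an array with split. A sketch (path is an assumption):
scala> import org.apache.spark.sql.functions._
scala> val tx = spark.read.option("sep", ",").csv("file:///home/cloudera/file4.txt").toDF("name", "age", "qual")
scala> tx.select($"name", $"age".cast("int") as "age", split($"qual", "#") as "qual").show(false)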
// heterogeneous: the purpose of each element is different
cat > file5.txt
Rani,26,BTech#Nagarjuna University#2012#72
Ravi,29,Bsc#Osmania University#2011#69
hive> create table taby(name string, age int, qual struct<qual:string,university:string,year:int,percentage:int>) row format delimited fields terminated by ',' collection items terminated by '#';
OK
Time taken: 0.033 seconds
hive> load data local inpath 'file5.txt' into table taby;
Loading data to table mydb.taby
Table mydb.taby stats: [numFiles=1, totalSize=82]
OK
Time taken: 0.113 seconds
hive> select * from taby;
OK
Rani 26 {"qual":"BTech","university":"Nagarjuna University","year":2012,"percentage":72}
Ravi 29 {"qual":"Bsc","university":"Osmania University","year":2011,"percentage":69}
Time taken: 0.049 seconds, Fetched: 2 row(s)
hive> select name,age,qual.qual,qual.university, qual.year,qual.percentage from taby;
OK
Rani 26 BTech Nagarjuna University 2012 72
Ravi 29 Bsc Osmania University 2011 69
Time taken: 0.055 seconds, Fetched: 2 row(s)
cat file6.txt
Ravi,BTech$90#MTech$70,35
Mani,Bsc$80#Msc$90#MBA$60,25
hive> create table tabz(name string, qual map<string,int>, age int) row format delimited fields terminated by ',' collection items terminated by '#' map keys terminated by '$';
OK
Time taken: 0.034 seconds
hive> load data local inpath 'file6.txt' into table tabz;
Loading data to table mydb.tabz
Table mydb.tabz stats: [numFiles=1, totalSize=55]
OK
Time taken: 0.121 seconds
hive> select * from tabz;
OK
Ravi {"BTech":90,"MTech":70} 35
Mani {"Bsc":80,"Msc":90,"MBA":60} 25
Time taken: 0.033 seconds, Fetched: 2 row(s)
create table tabu(a int, b int, c int) stored as textfile;
create table tabu(a int, b int, c int) stored as sequencefile;
create table tabu(a int, b int, c int) stored as rcfile;
create table tabu(a int, b int, c int) stored as orc;
create table tabu(a int, b int, c int) stored as parquet;
hive> create index idx1 on table emp(id) as 'compact' with deferred rebuild;
hive> create view myview1 as select name,age,income from info1;
hive> create view myview2 as select * from info1 where age > 25;
hive> create table tab1 (a int, b int, c int);
hive> insert into tab1 (a,b,c) values(1,2,3);
hive> select * from tab1;
OK
1 2 3
hive> alter table tab1 add columns (d int, e int);
OK
Time taken: 0.052 seconds
hive> select * from tab1;
OK
1 2 3 NULL NULL
Time taken: 0.043 seconds, Fetched: 1 row(s)
hive> insert overwrite table tab1 select a,b,c,a+b+c, (a+b+c)/3 from tab1;
hive> select * from tab1;
OK
1 2 3 6 2
Time taken: 0.035 seconds, Fetched: 1 row(s)
Drop statements:
----------------
drop table mytab;
drop index idx1;
drop view myview1;
drop database mydb;
hive> drop database mydb;
FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.DDLTask. InvalidOperationException(message:Database mydb is not empty. One or more tables exist.)
// Before dropping a database, delete all its contents... or use: drop database mydb cascade;
cat info.txt
-------------
101,aaa,30000,m,11
102,bbb,40000,m,12
103,ccc,90000,f,13
104,ddd,10000,m,12
105,eee,20000,f,14
106,fff,20000,m,14
hive> create table emp(id int, name string, sal int, sex string, dno int) row format delimited fields terminated by ',';
hive> load data local inpath 'info.txt' into table emp;
hive> select * from emp;
OK
101 aaa 30000 m 11
102 bbb 40000 m 12
103 ccc 90000 f 13
104 ddd 10000 m 12
105 eee 20000 f 14
106 fff 20000 m 14
Time taken: 0.046 seconds, Fetched: 6 row(s)
hive> select * from emp;
OK
101 aaa 30000 m 11
102 bbb 40000 m 12
103 ccc 90000 f 13
104 ddd 10000 m 12
105 eee 20000 f 14
106 fff 20000 m 14
Time taken: 0.033 seconds, Fetched: 6 row(s)
hive> select name,sal from emp;
OK
aaa 30000
bbb 40000
ccc 90000
ddd 10000
eee 20000
fff 20000
Time taken: 0.043 seconds, Fetched: 6 row(s)
hive> select * from emp where sex='m';
OK
101 aaa 30000 m 11
102 bbb 40000 m 12
104 ddd 10000 m 12
106 fff 20000 m 14
Time taken: 0.065 seconds, Fetched: 4 row(s)
hive> select * from emp where dno=11 or dno=12;
OK
101 aaa 30000 m 11
102 bbb 40000 m 12
104 ddd 10000 m 12
Time taken: 0.045 seconds, Fetched: 3 row(s)
hive> select * from emp where dno in (11,12);
OK
101 aaa 30000 m 11
102 bbb 40000 m 12
104 ddd 10000 m 12
Time taken: 0.04 seconds, Fetched: 3 row(s)
hive> select * from emp where dno != 11 and dno!= 13;
OK
102 bbb 40000 m 12
104 ddd 10000 m 12
105 eee 20000 f 14
106 fff 20000 m 14
Time taken: 0.075 seconds, Fetched: 4 row(s)
hive> select * from emp where dno not in (11,13);
OK
102 bbb 40000 m 12
104 ddd 10000 m 12
105 eee 20000 f 14
106 fff 20000 m 14
Time taken: 0.05 seconds, Fetched: 4 row(s)
Aggregation functions:
----------------------
sum(), avg(), max(), min(), count(), corr(), stddev(), covar_pop()
udf  - user defined function (one row in, one value out)
udaf - user defined aggregate function (many rows in, one value out)
udtf - user defined table generating function (one row in, many rows out)
hive> select sum(sal) from emp;
210000
hive> select sum(sal), avg(sal), max(sal), min(sal), count(*) from emp;
210000 35000.0 90000 10000 6
Group By:
to get aggregations separately for each data group.
single group by sex:
f ?
m ?
multi group by dno, sex:
11 f ?
11 m ?
12 f ?
12 m ?
hive> select sex,sum(sal) from emp group by sex;
f 110000
m 100000
hive> select sum(sal), avg(sal), max(sal), min(sal), count(*) from emp group by sex;
110000 55000.0 90000 20000 2
100000 25000.0 40000 10000 4
hive> select dno,sex, sum(sal),avg(sal), count(*) from emp group by dno,sex;
11 m 30000 30000.0 1
12 m 50000 25000.0 2
13 f 90000 90000.0 1
14 f 20000 20000.0 1
14 m 20000 20000.0 1
hive> select dno, sum(sal) from emp where dno in (11,12,13) group by dno;
11 30000
12 50000
13 90000
hive> select dno, sum(sal) from emp group by dno having dno in (11,12,13);
11 30000
12 50000
13 90000
// use where to filter rows before grouping, and having to filter groups after aggregation
// where is applied early (before the reduce phase), so the reducer handles less data
// having is needed when the condition must see aggregated values, e.g. having sum(sal) > 50000
In the having filter, only grouping columns and aggregate functions are valid.
Invalid query:
--------------
select dno, sum(sal) from info group by dno having city in ('Hyd','Del','Pune')
// It's invalid because city is neither a grouping column nor wrapped in an aggregate
Valid Query:
------------
select dno, sum(sal) from info where city in ('Hyd','Del','Pune') group by dno;
where and having can also be combined:
select dno, sum(sal) from info
where city in ('Hyd','Del','Pune')
group by dno having dno in (11,12,13);
cat atm.txt
101,1000
102,2000
101,2000
101,3000
102,8000
103,1000
104,9000
hive> create table atm(acno int, amt int) row format delimited fields terminated by ',';
OK
Time taken: 0.129 seconds
hive> load data local inpath 'atm.txt' into table atm;
Loading data to table mydb.atm
Table mydb.atm stats: [numFiles=1, totalSize=63]
OK
Time taken: 0.162 seconds
hive> select * from atm;
OK
101 1000
102 2000
101 2000
101 3000
102 8000
103 1000
104 9000
Time taken: 0.026 seconds, Fetched: 7 row(s)
// get acnos that did more than 1 transaction.
hive> select acno,count(*) from atm group by acno having count(*) > 1;
101 3
102 2
hive> select acno from atm group by acno having count(*) > 1;
101
102
hive> select acno, sum(amt) from atm group by acno;
101 6000
102 10000
103 1000
104 9000
hive> select acno, sum(amt) as tot from atm group by acno having tot > 5000;
101 6000
102 10000
104 9000
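The where/having split shows up in Spark as filter before vs. after the aggregation. A sketch, assuming the atm rows are in atmDF (hypothetical name):
scala> import org.apache.spark.sql.functions._
scala> atmDF.groupBy($"acno").agg(sum($"amt") as "tot").filter($"tot" > 5000).show   // like having tot > 5000
scala> atmDF.groupBy($"acno").count.filter($"count" > 1).select($"acno").show        // like having count(*) > 1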
Eliminating Duplicate Rows:
---------------------------
cat samp.txt;
Ravi,30
Mani,20
Ravi,30
Ravi,30
Mani,20
Giri,50
Giri,50
Giri,50
hive> create table samp(name string, age int) row format delimited fields terminated by ',';
OK
Time taken: 0.04 seconds
hive> load data local inpath 'samp.txt' into table samp;
Loading data to table mydb.samp
Table mydb.samp stats: [numFiles=1, totalSize=64]
OK
Time taken: 0.177 seconds
hive> select * from samp;
OK
Ravi 30
Mani 20
Ravi 30
Ravi 30
Mani 20
Giri 50
Giri 50
Giri 50
Time taken: 0.026 seconds, Fetched: 8 row(s)
hive> select distinct(name), age from samp;
Giri 50
Mani 20
Ravi 30
hive> load data local inpath 'samp2.txt' overwrite into table samp;
hive> select name,age,count(*) from samp group by name,age;
Giri 50 3
Mani 20 2
Ravi 30 3
hive> select name,age from samp group by name, age;
Giri 50
Mani 20
Ravi 30
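Spark offers the same two dedup routes: distinct over all columns, or dropDuplicates over a chosen subset. A sketch (sampDF is hypothetical):
scala> sampDF.distinct().show                    // like select distinct ...
scala> sampDF.dropDuplicates("name", "age").show // dedup on specific columns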
cat tab1.txt
101,1000
102,null
103,2000
104,null
hive> create table tab1 (id int, sal int) row format delimited fields terminated by ',';
OK
Time taken: 0.025 seconds
hive> load data local inpath 'tab1.txt' into table tab1;
Loading data to table mydb.tab1
Table mydb.tab1 stats: [numFiles=1, totalSize=36]
OK
Time taken: 0.118 seconds
hive> select * from tab1;
OK
101 1000
102 NULL
103 2000
104 NULL
Time taken: 0.052 seconds, Fetched: 4 row(s)
// including null values
hive> select count(*) from tab1;
4
// excluding null values
hive> select count(sal) from tab1;
2
hive> select count(*) from tab1 where sal is null;
2
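The same null-aware counts in Spark (tab1DF is hypothetical):
scala> tab1DF.count                              // all rows, like count(*)
scala> tab1DF.filter($"sal".isNotNull).count     // like count(sal)
scala> tab1DF.filter($"sal".isNull).count        // rows where sal is null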
Hive Unions:
------------
To merge datasets (tables)
In SQL,
2 types of Unions
union - Doesn't allow duplicates
union all - allows duplicate rows.
In HQL,
we have only "union all"
emp1
emp2
emp3
case#1:
schema of all the tables should be the same.
create table newTab like emp1;
insert into [overwrite] table newTab
select * from emp1
union all
select * from emp2
union all
select * from emp3
case#2:
schemas of the tables are different (same columns, different order)
tab1 -> name,city
tab2 -> city,name
select name,city from tab1
union all
select name,city from tab2
// the column order in all the queries of the union should be the same
case#3:
the tables each have a field the other one lacks.
a field is available in Table#1
but the different field is available in Table#2
tabX -> name,city,sex (age is missing here)
tabY -> name,city,age (sex is missing here)
create table tabZ (name string, age int, sex string, city string);
insert into table tabZ
select name,null as age, sex, city from tabX
union all
select name,age,null as sex, city from tabY
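Case#3 in Spark: add the missing column as a typed null literal, then union. unionByName (Spark 2.3+) lines the columns up by name rather than position. A sketch, with hypothetical tabX/tabY DataFrames:
scala> import org.apache.spark.sql.functions._
scala> val tabZ = tabX.withColumn("age", lit(null).cast("int")).unionByName(tabY.withColumn("sex", lit(null).cast("string")))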
case#4:
file1 -> 101,aaaa,10000 (comma separated)
....
file2 -> 201,bbbb,30000 (comma separated)
....
file3 -> 301 40000 aaaaa (tab separated)
for file#1, file#2 -> structure is same (comma separated)
for file#3 -> structure is different (tab separated)
structure means field names and order of them.
way#1:
create table tab1(id int, name string, sal int) row format delimited fields terminated by ',';
create table tab2(id int, sal int, name string) row format delimited fields terminated by '\t';
load data local inpath 'file1.txt' into table tab1;
load data local inpath 'file2.txt' into table tab1;
load data local inpath 'file3.txt' into table tab2;
tab1 -> id,name,sal
tab2 -> id,sal,name
same column with different column order
create table info like tab1;
insert into table info
select id,name,sal from tab1
union all
select id,name,sal from tab2
file1.txt,file2.txt content pushed into tab1
file3.txt content pushed into tab2
finally we merged tab1 and tab2 together to make info.
tab1 --> 1 Lakh rows
tab2 --> 10000 rows
Join in Hive:
-------------
Inner joins
Outer Joins
Left outer
Right outer
Full Outer
Inner join:
select l.x, r.x from tab1 l join tab2 r on (l.x = r.x);
Left outer:
select l.x, r.x from tab1 l left outer join tab2 r on (l.x = r.x);
Right outer:
select l.x, r.x from tab1 l right outer join tab2 r on (l.x = r.x);
Full outer:
select l.x, r.x from tab1 l full outer join tab2 r on (l.x = r.x);
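The same four joins in Spark differ only in the join-type string. A sketch, with hypothetical l/r DataFrames sharing a column x:
scala> l.join(r, l("x") === r("x"))                 // inner (the default)
scala> l.join(r, l("x") === r("x"), "left_outer")
scala> l.join(r, l("x") === r("x"), "right_outer")
scala> l.join(r, l("x") === r("x"), "full_outer")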
cat emp
101,aaaa,40000,m,11
102,bbbb,50000,f,12
103,cccc,90000,m,12
104,dddd,10000,f,13
105,eeee,20000,m,11
106,ffff,20000,m,14
107,mmmm,60000,m,15
109,nnnn,50000,f,11
cat dept
11,marketing,hyd
12,hr,del
13,finance,hyd
20,admin,del
21,production,hyd
joins are usually better than correlated sub-queries,
but on very large tables (crores of rows) reduce-side joins become expensive
hive> create database joins;
OK
Time taken: 0.483 seconds
hive> use joins;
OK
Time taken: 0.026 seconds
hive> create table emp(id int, name string, sal int, sex string, dno int) row format delimited fields terminated by ',';
hive> load data local inpath 'emp' into table emp;
hive> select * from emp;
OK
101 aaaa 40000 m 11
102 bbbb 50000 f 12
103 cccc 90000 m 12
104 dddd 10000 f 13
105 eeee 20000 m 11
106 ffff 20000 m 14
107 mmmm 60000 m 15
109 nnnn 50000 f 11
Time taken: 0.435 seconds, Fetched: 8 row(s)
hive> create table dept (dno int, dname string, dloc string) row format delimited fields terminated by ',';
OK
Time taken: 0.1 seconds
hive> load data local inpath 'dept' into table dept;
Loading data to table joins.dept
Table joins.dept stats: [numFiles=1, totalSize=73]
OK
Time taken: 0.159 seconds
hive> select * from dept;
OK
11 marketing hyd
12 hr del
13 finance hyd
20 admin del
21 production hyd
Time taken: 0.059 seconds, Fetched: 5 row(s)
hive> select * from emp;
OK
101 aaaa 40000 m 11
102 bbbb 50000 f 12
103 cccc 90000 m 12
104 dddd 10000 f 13
105 eeee 20000 m 11
106 ffff 20000 m 14
107 mmmm 60000 m 15
109 nnnn 50000 f 11
Time taken: 0.091 seconds, Fetched: 8 row(s)
hive> select * from dept;
OK
11 marketing hyd
12 hr del
13 finance hyd
20 admin del
21 production hyd
Time taken: 0.039 seconds, Fetched: 5 row(s)
hive> create table edinfo(id int, name string, sal int, sex string, dname string, dloc string, dno1 int, dno2 int) row format delimited fields terminated by ',';
OK
Time taken: 0.063 seconds
hive> insert overwrite table edinfo select id,name,sal,sex,dname,dloc,l.dno,r.dno from emp l full outer join dept r on (l.dno = r.dno);
hive> select * from edinfo;
OK
109 nnnn 50000 f marketing hyd 11 11
105 eeee 20000 m marketing hyd 11 11
101 aaaa 40000 m marketing hyd 11 11
103 cccc 90000 m hr del 12 12
102 bbbb 50000 f hr del 12 12
104 dddd 10000 f finance hyd 13 13
106 ffff 20000 m NULL NULL 14 NULL
107 mmmm 60000 m NULL NULL 15 NULL
NULL NULL NULL NULL admin del NULL 20
NULL NULL NULL NULL production hyd NULL 21
hdfs dfs -cat /user/hive/warehouse/joins.db/edinfo/000000_0
109,nnnn,50000,f,marketing,hyd,11,11
105,eeee,20000,m,marketing,hyd,11,11
101,aaaa,40000,m,marketing,hyd,11,11
103,cccc,90000,m,hr,del,12,12
102,bbbb,50000,f,hr,del,12,12
104,dddd,10000,f,finance,hyd,13,13
106,ffff,20000,m,\N,\N,14,\N
107,mmmm,60000,m,\N,\N,15,\N
\N,\N,\N,\N,admin,del,\N,20
\N,\N,\N,\N,production,hyd,\N,21
hive> select dloc, sum(sal) from edinfo group by dloc;
NULL 80000
del 140000
hyd 120000
[cloudera@quickstart ~]$ cat emp
101,aaaa,40000,m,11
102,bbbb,50000,f,12
103,cccc,90000,m,12
104,dddd,10000,f,13
105,eeee,20000,m,11
106,ffff,20000,m,14
107,mmmm,60000,m,15
109,nnnn,50000,f,11
[cloudera@quickstart ~]$ cat dept
11,marketing,hyd
12,hr,del
13,finance,hyd
20,admin,del
21,production,hyd
hive> select * from emp;
OK
101 aaaa 40000 m 11
102 bbbb 50000 f 12
103 cccc 90000 m 12
104 dddd 10000 f 13
105 eeee 20000 m 11
106 ffff 20000 m 14
107 mmmm 60000 m 15
109 nnnn 50000 f 11
Time taken: 0.041 seconds, Fetched: 8 row(s)
hive> select * from dept;
OK
11 marketing hyd
12 hr del
13 finance hyd
20 admin del
21 production hyd
Time taken: 0.036 seconds, Fetched: 5 row(s)
hive> create table swengineers (id int, name string, sal int, sex string, pid int) row format delimited fields terminated by ',';
OK
Time taken: 0.155 seconds
hive> load data local inpath 'emp' into table swengineers;
Loading data to table joins.swengineers
Table joins.swengineers stats: [numFiles=1, totalSize=160]
OK
Time taken: 0.103 seconds
hive> select * from swengineers;
OK
101 aaaa 40000 m 11
102 bbbb 50000 f 12
103 cccc 90000 m 12
104 dddd 10000 f 13
105 eeee 20000 m 11
106 ffff 20000 m 14
107 mmmm 60000 m 15
109 nnnn 50000 f 11
Time taken: 0.028 seconds, Fetched: 8 row(s)
hive> create table projects (pid int, pname string, loc string) row format delimited fields terminated by ',';
OK
Time taken: 0.032 seconds
hive> load data local inpath 'dept' into table projects;
Loading data to table joins.projects
Table joins.projects stats: [numFiles=1, totalSize=73]
OK
Time taken: 0.11 seconds
hive> select * from projects;
OK
11 marketing hyd
12 hr del
13 finance hyd
20 admin del
21 production hyd
Time taken: 0.03 seconds, Fetched: 5 row(s)
hive> select * from swengineers;
OK
101 aaaa 40000 m 11
102 bbbb 50000 f 12
103 cccc 90000 m 12
104 dddd 10000 f 13
105 eeee 20000 m 11
106 ffff 20000 m 14
107 mmmm 60000 m 15
109 nnnn 50000 f 11
Time taken: 0.032 seconds, Fetched: 8 row(s)
hive> select * from projects;
OK
11 marketing hyd
12 hr del
13 finance hyd
20 admin del
21 production hyd
Time taken: 0.077 seconds, Fetched: 5 row(s)
11,12,13 guys are working in projects
14,15 are recruited but not into projects
20,21 - projects started but no employees recruited
working team -- inner join
11,12,13
bench team
14,15
bench project
20,21
full outer join - i don't want to miss any info
hive> create table prengineers (pid1 int, pid2 int, sal int);
hive> insert overwrite table prengineers select l.pid, r.pid, sal from swengineers l full outer join projects r on (l.pid = r.pid);
hive> select * from prengineers;
OK
11 11 50000
11 11 20000
11 11 40000
12 12 90000
12 12 50000
13 13 10000
14 NULL 20000
15 NULL 60000
NULL 20 NULL
NULL 21 NULL
Time taken: 0.041 seconds, Fetched: 10 row(s)
first 5 records are working engineers rows
next 2 bench team
next 2 bench project
hive> create table transformed (stat string, sal int);
hive> insert overwrite table transformed select if(pid2 is null, 'BenchTeam', if(pid1 is null,'BenchProject','Working')), sal from prengineers;
hive> select * from transformed;
OK
Working 50000
Working 20000
Working 40000
Working 90000
Working 50000
Working 10000
BenchTeam 20000
BenchTeam 60000
BenchProject NULL
BenchProject NULL
hive> insert overwrite table transformed select stat, if(sal is null, 0,sal) from transformed;
hive> select * from transformed;
OK
Working 50000
Working 20000
Working 40000
Working 90000
Working 50000
Working 10000
BenchTeam 20000
BenchTeam 60000
BenchProject 0
BenchProject 0
Time taken: 0.03 seconds, Fetched: 10 row(s)
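Both if() passes collapse into when/otherwise plus coalesce in Spark, and the summary is one more groupBy. A sketch, assuming the prengineers rows are in prDF (hypothetical name):
scala> import org.apache.spark.sql.functions._
scala> prDF.select(when($"pid2".isNull, "BenchTeam").when($"pid1".isNull, "BenchProject").otherwise("Working") as "stat", coalesce($"sal", lit(0)) as "sal").groupBy($"stat").agg(sum($"sal") as "tot").show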
hive> create table prsummary(stat string, tot int) row format delimited fields terminated by ',';
hive> insert overwrite table prsummary select stat, sum(sal) from transformed group by stat;
hive> select * from prsummary;
OK
BenchProject 0
BenchTeam 80000
Working 260000
Time taken: 0.03 seconds, Fetched: 3 row(s)
hive> select * from swengineers;
OK
101 aaaa 40000 m 11
102 bbbb 50000 f 12
103 cccc 90000 m 12
104 dddd 10000 f 13
105 eeee 20000 m 11
106 ffff 20000 m 14
107 mmmm 60000 m 15
109 nnnn 50000 f 11
Time taken: 0.042 seconds, Fetched: 8 row(s)
hive> select * from projects;
OK
11 marketing hyd
12 hr del
13 finance hyd
20 admin del
21 production hyd
Time taken: 0.029 seconds, Fetched: 5 row(s)
hive> select * from prengineers;
OK
11 11 50000
11 11 20000
11 11 40000
12 12 90000
12 12 50000
13 13 10000
14 NULL 20000
15 NULL 60000
NULL 20 NULL
NULL 21 NULL
Time taken: 0.031 seconds, Fetched: 10 row(s)
hive> select * from transformed;
OK
Working 50000
Working 20000
Working 40000
Working 90000
Working 50000
Working 10000
BenchTeam 20000
BenchTeam 60000
BenchProject 0
BenchProject 0
Time taken: 0.033 seconds, Fetched: 10 row(s)
hive> select * from prsummary;
OK
BenchProject 0
BenchTeam 80000
Working 260000
Time taken: 0.042 seconds, Fetched: 3 row(s)
we don't want to miss anything - so we used full outer join
OLTP - full outer join is not recommended
Batch Processing, analytics - full outer is recommended
Cartesian product is not recommended in OLTP
Cartesian product is widely used in Batch Processing.
-----------------------------------------------------------------
Hive
To logon to mysql via Command Line Interface:
mysql -u root -pcloudera
To List Databases:
mysql> show databases;
+--------------------+
| Database |
+--------------------+
| information_schema |
| cm |
| firehose |
| hue |
| metastore |
| mysql |
| nav |
| navms |
| oozie |
| retail_db |
| rman |
| sentry |
+--------------------+
12 rows in set (0.11 sec)
Inside Hive:
How to create a Database
hive> create database sample_programs;
OK
Time taken: 3.196 seconds
To list databases within Hive:
hive> show databases;
OK
default
sample_programs
Hive databases and tables are stored in the following HDFS location:
hdfs dfs -ls /user/hive/warehouse
Found 1 items
drwxrwxrwx - cloudera supergroup 0 2018-07-25 03:27 /user/hive/warehouse/sample_programs.db
open current DB:
use sample_programs;
To create a table in Hive within sample_programs db:
create table employees (std int, fname string, lname string, age int, mob string, city string) row format delimited fields terminated by ',';
To copy employees.txt file into /user/hive:
hdfs dfs -put /home/cloudera/employees.txt /user/hive/employees.txt
To view the content of employees.txt in command line interface:
hdfs dfs -cat /user/hive/employees.txt
001,Rajiv,Reddy,21,988383,Hyderabad
002,Siddharth,Malhothra,22,9932323,Kolkatta
003,Rajesh,Rakul,22,999333,Kolkatta
004,Preeti,Singh,21,9234243,Hyderabad
005,Trupthi,Mohanthy,23,988833,Bhuwaneshwar
006,Archana,Mishra,24,988323,Chennai
007,Komal,Nayak,23,98888,Chennai
008,Bharathi,Nambiar,22,9888888,Hyderabad
009,Bharathi,Nambiar,21,9888888,Chennai
005,Trupthi,Mohanthy,21,988833,Bhuwaneshwar
To load data into existing hive table:
hive> load data inpath '/user/hive/employees.txt' into table employees;
Loading data to table sample_programs.employees
Table sample_programs.employees stats: [numFiles=1, totalSize=394]
OK
To display the content of a table
hive> select * from employees;
OK
1 Rajiv Reddy 21 988383 Hyderabad
2 Siddharth Malhothra 22 9932323 Kolkatta
3 Rajesh Rakul 22 999333 Kolkatta
4 Preeti Singh 21 9234243 Hyderabad
5 Trupthi Mohanthy 23 988833 Bhuwaneshwar
6 Archana Mishra 24 988323 Chennai
7 Komal Nayak 23 98888 Chennai
8 Bharathi Nambiar 22 9888888 Hyderabad
9 Bharathi Nambiar 21 9888888 Chennai
5 Trupthi Mohanthy 21 988833 Bhuwaneshwar
Time taken: 0.603 seconds, Fetched: 10 row(s)
Now check /user/hive/warehouse:
hdfs dfs -ls /user/hive/warehouse/sample_programs.db/employees
Found 1 items
-rwxrwxrwx 1 cloudera supergroup 394 2018-07-25 03:39 /user/hive/warehouse/sample_programs.db/employees/employees.txt
hdfs dfs -cat /user/hive/warehouse/sample_programs.db/employees/employees.txt
001,Rajiv,Reddy,21,988383,Hyderabad
002,Siddharth,Malhothra,22,9932323,Kolkatta
003,Rajesh,Rakul,22,999333,Kolkatta
004,Preeti,Singh,21,9234243,Hyderabad
005,Trupthi,Mohanthy,23,988833,Bhuwaneshwar
006,Archana,Mishra,24,988323,Chennai
007,Komal,Nayak,23,98888,Chennai
008,Bharathi,Nambiar,22,9888888,Hyderabad
009,Bharathi,Nambiar,21,9888888,Chennai
005,Trupthi,Mohanthy,21,988833,Bhuwaneshwar
Copy women.txt from local linux to hdfs:
hdfs dfs -put /home/cloudera/women.txt /user/hive/women.txt
Create a folder in hive named as women_employees:
hdfs dfs -mkdir /user/hive/women_employees
Move a file between hdfs locations:
hdfs dfs -mv /user/hive/women.txt /user/hive/women_employees/women.txt
To view the content of women.txt which is stored in hdfs
hdfs dfs -cat /user/hive/women_employees/women.txt
001,Nithya,Duraisamy,31,Manager,9586452156,Hyderabad
002,Nandhini,Babu,28,Assistant Manager,Delhi
003,Madhuri,Nathan,51,VP,999333323,Hyderabad
004,Kavitha,Manoharan,45,AVP,992342344,Hyderabad
005,Vijaya,Kandhasamy,45,AVP,23452344,Noida
006,Aarthi,Raj,28,Assistant Manager,99234234,Chennai
007,Lavanya,Sankar,23,Senior Engineer,9923444,Chennai
008,Meena,Baskar,51,VP,2342344444,Hyderabad
009,Gayathri,Raghu,22,Engineer,323423444,Chennai
010,Thenmozhi,Rajan,45,AVP,234234234,Noida
Create external table:
create external table women_employees(sid int, fname string, lname string, age int, mob string, city string) row format delimited fields terminated by ',' location '/user/hive/women_employees';
// Here we need to specify hdfs folder name not a file name.
Change current database:
hive> use sample_programs;
OK
Time taken: 0.02 seconds
Display tables in sample_programs database:
hive> show tables;
OK
employees
women_employees
Time taken: 0.026 seconds, Fetched: 2 row(s)
Display the content of a table:
hive> select * from women_employees;
OK
1 Nithya Duraisamy 31 Manager 9586452156
2 Nandhini Babu 28 Assistant Manager Delhi
3 Madhuri Nathan 51 VP 999333323
4 Kavitha Manoharan 45 AVP 992342344
5 Vijaya Kandhasamy 45 AVP 23452344
6 Aarthi Raj 28 Assistant Manager 99234234
7 Lavanya Sankar 23 Senior Engineer 9923444
8 Meena Baskar 51 VP 2342344444
9 Gayathri Raghu 22 Engineer 323423444
10 Thenmozhi Rajan 45 AVP 234234234
NULL NULL NULL NULL NULL NULL
Time taken: 0.097 seconds, Fetched: 11 row(s)
hive>
Load data into table:
load data inpath '/user/hive/women_employees/women.txt' into table women_employees;
Loading data to table sample_programs.women_employees
Table sample_programs.women_employees stats: [numFiles=1, numRows=0, totalSize=480, rawDataSize=0]
OK
Time taken: 0.772 seconds
External table data is not stored in the warehouse location:
hdfs dfs -ls /user/hive/warehouse/sample_programs.db
Found 1 items
drwxrwxrwx - cloudera supergroup 0 2018-07-25 03:42 /user/hive/warehouse/sample_programs.db/employees
It has internal table (employees) information only
creating partition in hive table:
create table emp(fname string, mob string) PARTITIONED BY (city string);
set the following property in hive:
set hive.exec.dynamic.partition.mode=nonstrict;
Take rows from employees table and put them into emp PARTITIONED table.
insert overwrite table emp PARTITION(city) SELECT fname,mob,city from employees;
Loading partition {city=Hyderabad}
Loading partition {city=Bhuwaneshwar}
Loading partition {city=Chennai}
Loading partition {city=Kolkatta}
Time taken for adding to write entity : 2
Partition sample_programs.emp{city=Bhuwaneshwar} stats: [numFiles=1, numRows=2, totalSize=30, rawDataSize=28]
Partition sample_programs.emp{city=Chennai} stats: [numFiles=1, numRows=3, totalSize=44, rawDataSize=41]
Partition sample_programs.emp{city=Hyderabad} stats: [numFiles=1, numRows=3, totalSize=45, rawDataSize=42]
Partition sample_programs.emp{city=Kolkatta} stats: [numFiles=1, numRows=2, totalSize=32, rawDataSize=30]
MapReduce Jobs Launched:
See the partition folders in hdfs:
hdfs dfs -ls /user/hive/warehouse/sample_programs.db/emp/
Found 4 items
drwxrwxrwx - cloudera supergroup 0 2018-07-25 04:44 /user/hive/warehouse/sample_programs.db/emp/city=Bhuwaneshwar
drwxrwxrwx - cloudera supergroup 0 2018-07-25 04:44 /user/hive/warehouse/sample_programs.db/emp/city=Chennai
drwxrwxrwx - cloudera supergroup 0 2018-07-25 04:44 /user/hive/warehouse/sample_programs.db/emp/city=Hyderabad
drwxrwxrwx - cloudera supergroup 0 2018-07-25 04:44 /user/hive/warehouse/sample_programs.db/emp/city=Kolkatta
hive> select * from emp;
OK
Trupthi 988833 Bhuwaneshwar
Trupthi 988833 Bhuwaneshwar
Archana 988323 Chennai
Komal 98888 Chennai
Bharathi 9888888 Chennai
Rajiv 988383 Hyderabad
Preeti 9234243 Hyderabad
Bharathi 9888888 Hyderabad
Siddharth 9932323 Kolkatta
Rajesh 999333 Kolkatta
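One payoff of partitioning: a filter on the partition column reads only that partition's folder (partition pruning). A minimal sketch against the emp table above:
hive> select fname, mob from emp where city = 'Chennai';
// Only the city=Chennai folder is scanned, not the other three partitions.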
Create a bucketed table in Hive:
create table ep(fname string,mob string, city string) CLUSTERED BY(city) into 3 buckets row format delimited fields terminated by ',';
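One caveat: on Hive versions before 2.0, inserts respect the bucket definition only if enforcement is switched on (2.x and later always enforce it), so it is worth setting before the load below:
hive> set hive.enforce.bucketing = true;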
Load data into ep:
from emp insert overwrite table ep SELECT fname,mob,city;
Display the content of the ep table:
select * from ep;
OK
Trupthi 988833 Bhuwaneshwar
Trupthi 988833 Bhuwaneshwar
Archana 988323 Chennai
Komal 98888 Chennai
Bharathi 9888888 Chennai
Rajiv 988383 Hyderabad
Preeti 9234243 Hyderabad
Bharathi 9888888 Hyderabad
Siddharth 9932323 Kolkatta
Rajesh 999333 Kolkatta
Time taken: 0.07 seconds, Fetched: 10 row(s)
To view the bucket files:
hdfs dfs -ls /user/hive/warehouse/sample_programs.db/ep
Found 3 items
-rwxrwxrwx 1 cloudera supergroup 249 2018-07-25 04:54 /user/hive/warehouse/sample_programs.db/ep/000000_0
-rwxrwxrwx 1 cloudera supergroup 249 2018-07-25 04:54 /user/hive/warehouse/sample_programs.db/ep/000001_0
-rwxrwxrwx 1 cloudera supergroup 249 2018-07-25 04:54 /user/hive/warehouse/sample_programs.db/ep/000002_0
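Bucketing also enables efficient sampling by bucket. A minimal sketch against the three-bucket ep table above:
hive> select * from ep tablesample(bucket 1 out of 3 on city);
// Reads only the first bucket file instead of scanning the whole table.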
Create a view in hive:
hive> create view age as select * from employees where age > 22;
OK
Time taken: 0.193 seconds
Query data against view:
hive> select * from age;
OK
5 Trupthi Mohanthy 23 988833 Bhuwaneshwar
6 Archana Mishra 24 988323 Chennai
7 Komal Nayak 23 98888 Chennai
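A view stores only the query, so dropping it leaves the underlying employees table untouched:
hive> drop view age;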
There are 2 types of indexes in Hive:
compact index and bitmap index
Create a bitmap index:
hive> create index mobile on table employees(mob) as 'bitmap' with deferred rebuild;
Create a compact index:
hive> create index city on table employees(city) as 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler' with deferred rebuild;
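With deferred rebuild, the index table stays empty until it is rebuilt explicitly; a sketch of the rebuild step for the two indexes above:
hive> alter index mobile on employees rebuild;
hive> alter index city on employees rebuild;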
To view the list of indexes created earlier:
hive> show formatted index on employees;
idx_name tab_name col_names idx_tab_name idx_type comment
mobile employees mob sample_programs__employees_mobile__ bitmap
city employees city sample_programs__employees_city__ compact
How to drop an existing index:
hive> drop index mobile on employees;
hive> show formatted index on employees; // the mobile index is no longer listed
idx_name tab_name col_names idx_tab_name idx_type comment
city employees city sample_programs__employees_city__ compact
Time taken: 0.083 seconds, Fetched: 4 row(s)
hive>
Sort order:
select * from employees order by age;
5 Trupthi Mohanthy 21 988833 Bhuwaneshwar
9 Bharathi Nambiar 21 9888888 Chennai
4 Preeti Singh 21 9234243 Hyderabad
1 Rajiv Reddy 21 988383 Hyderabad
8 Bharathi Nambiar 22 9888888 Hyderabad
3 Rajesh Rakul 22 999333 Kolkatta
2 Siddharth Malhothra 22 9932323 Kolkatta
7 Komal Nayak 23 98888 Chennai
5 Trupthi Mohanthy 23 988833 Bhuwaneshwar
6 Archana Mishra 24 988323 Chennai
Grouping operation:
select age,count(*) from employees group by age;
21 4
22 3
23 2
24 1
Sort in descending order:
select * from employees order by fname desc;
5 Trupthi Mohanthy 21 988833 Bhuwaneshwar
5 Trupthi Mohanthy 23 988833 Bhuwaneshwar
2 Siddharth Malhothra 22 9932323 Kolkatta
1 Rajiv Reddy 21 988383 Hyderabad
3 Rajesh Rakul 22 999333 Kolkatta
4 Preeti Singh 21 9234243 Hyderabad
7 Komal Nayak 23 98888 Chennai
9 Bharathi Nambiar 21 9888888 Chennai
8 Bharathi Nambiar 22 9888888 Hyderabad
6 Archana Mishra 24 988323 Chennai
hive> select * from employees sort by fname desc;
5 Trupthi Mohanthy 21 988833 Bhuwaneshwar
5 Trupthi Mohanthy 23 988833 Bhuwaneshwar
2 Siddharth Malhothra 22 9932323 Kolkatta
1 Rajiv Reddy 21 988383 Hyderabad
3 Rajesh Rakul 22 999333 Kolkatta
4 Preeti Singh 21 9234243 Hyderabad
7 Komal Nayak 23 98888 Chennai
9 Bharathi Nambiar 21 9888888 Chennai
8 Bharathi Nambiar 22 9888888 Hyderabad
6 Archana Mishra 24 988323 Chennai
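Note the difference: order by enforces a total order through a single reducer, while sort by only orders rows within each reducer (the outputs match here because the job ran with one reducer). For per-key ordering across many reducers, sort by is usually paired with distribute by; a minimal sketch:
hive> select * from employees distribute by city sort by city, age;
// Rows for the same city go to the same reducer and arrive sorted within it.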
To view the structure of a table:
describe employees;
std int
fname string
lname string
age int
mob string
city string
Hive Aru
-----------
To log on to Hive:
$ hive
To clear the screen:
hive> !clear;
To display all databases:
hive> show databases;
To create a database:
hive> create database batch_may;
To create a table:
hive> create table student (id int, name string, location string) row format delimited fields terminated by ',' stored as textFile;
To view the schema of a table:
hive> desc formatted student;
Create a folder in hdfs named USECASE:
$ hdfs dfs -mkdir /USECASE
create student.txt file in local:
$ cat > student.txt
001,Rajiv,Hyd
002,Siddhu,Kol
003,Rajesh,Del,
004,Dinesh,Chen,
005,Kumar,Mum
006,Ganesh,Aur
007,Ram,Blr
^C
Copy student.txt file from local to hdfs:
$ hdfs dfs -put student.txt /USECASE/student.txt
Display the uploaded file content in hdfs:
hdfs dfs -cat /USECASE/student.txt
001,Rajiv,Hyd
002,Siddhu,Kol
003,Rajesh,Del,
004,Dinesh,Chen,
005,Kumar,Mum
006,Ganesh,Aur
007,Ram,Blr
To load data to student table of batch_may db of Hive:
hive> load data inpath '/USECASE/student.txt' into table student;
Location of the data file in hdfs:
/user/hive/warehouse/student
RDBMS:
Schema on Write
Hive:
Schema on Read
No updates, transactions, or indexes in classic Hive (later versions added limited ACID support)
HDFS environment setup:
Make a folder and a subfolder, then copy a local linux file into hdfs:
$ hdfs dfs -mkdir /USECASE
$ hdfs dfs -mkdir /USECASE/student_ext
$ hdfs dfs -put student.txt /USECASE/student_ext/
Make an external table in hive:
create external table student_ext (id int, name string, location string) row format delimited fields terminated by ',' stored as textfile location '/USECASE/student_ext';
select * from student_ext;
OK
1 Rajiv Hyd
2 Siddhu Kol
3 Rajesh Del
4 Dinesh Chen
5 Kumar Mum
6 Ganesh Aur
7 Ram Blr
Create one more file in local linux :
cat > student1.txt
008,Arivu,Mdr
009,Raji,Pltr
010,Selvi,Chn
Create one more file in local linux :
cat > student2.txt
011,Arun,Lnd
012,Meena,Karai
Copy both files into hdfs from local linux:
[cloudera@quickstart ~]$ hdfs dfs -put student1.txt /USECASE/student_ext/
[cloudera@quickstart ~]$ hdfs dfs -put student2.txt /USECASE/student_ext/
Now run the same select query in hive again:
select * from student_ext;
OK
1 Rajiv Hyd
2 Siddhu Kol
3 Rajesh Del
4 Dinesh Chen
5 Kumar Mum
6 Ganesh Aur
7 Ram Blr // 7 rows taken from student.txt
8 Arivu Mdr
9 Raji Pltr
10 Selvi Chn // 3 rows taken from student1.txt
11 Arun Lnd
12 Meena Karai // 2 rows taken from student2.txt
Internal table's details:
describe formatted student;
Location: hdfs://quickstart.cloudera:8020/user/hive/warehouse/student
Table Type: MANAGED_TABLE
External table's details:
describe formatted student_ext;
Location: hdfs://quickstart.cloudera:8020/USECASE/student_ext
Table Type: EXTERNAL_TABLE
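Incidentally, a table can be flipped between managed and external through its table properties; a hedged sketch (the EXTERNAL property is standard Hive, and its value must be the uppercase string TRUE or FALSE):
hive> alter table student_ext set tblproperties('EXTERNAL'='FALSE'); // now managed
hive> alter table student_ext set tblproperties('EXTERNAL'='TRUE'); // back to external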
Here we are going to create a local text file and read it into Hive;
after loading, Hive copies the content into its warehouse directory (internal to Hive).
Create a local file in linux:
cat > mylocal.txt
Arun,Sanjana,Dubshmash
Trichy,Ramesh,TikTok
Chitra,Kajal,Musically
Randy,Kichdy,Youtube
Amith,Agarwal,Blogger
hive> !clear;
Create a regular table in hive:
hive> create table myTableloc (FirstName string, LastName String,SocialMedia string) row format delimited fields terminated by ',' stored as textfile;
Load local linux mylocal.txt file into hive :
hive> load data local inpath '/home/cloudera/mylocal.txt' into table myTableloc;
hive> select * from myTableloc;
OK
Arun Sanjana Dubshmash
Trichy Ramesh TikTok
Chitra Kajal Musically
Randy Kichdy Youtube
Amith Agarwal Blogger
hive> describe formatted myTableloc;
Location: hdfs://quickstart.cloudera:8020/user/hive/warehouse/mytableloc
Take data from an existing table and make a new table:
hive> create table myTableloc1 as select * from myTableloc;
hive> select * from myTableloc1;
OK
Arun Sanjana Dubshmash
Trichy Ramesh TikTok
Chitra Kajal Musically
Randy Kichdy Youtube
Amith Agarwal Blogger
Rename a table:
hive> show tables;
OK
mytableloc
mytableloc1 // we are going to rename this table
student
student_ext
Time taken: 0.022 seconds, Fetched: 4 row(s)
hive> alter table mytableloc1 rename to my1;
OK
Time taken: 0.262 seconds
hive> show tables;
OK
my1 // new name applied
mytableloc
student
student_ext
How to rename a column?
First view the current schema:
describe mytableloc;
OK
firstname string
lastname string
socialmedia string
Time taken: 0.081 seconds, Fetched: 3 row(s)
hive> alter table mytableloc change socialmedia media string;
OK
Time taken: 0.215 seconds
hive>
hive> describe mytableloc;
OK
firstname string
lastname string
media string // after renaming the socialmedia column to media
How to change the data type of an existing column?
hive> describe student;
id int // current data type is int
name string
location string
hive> alter table student change id student_id string;
hive> describe student;
student_id string // now data type changed into string
name string
location string
Drop a table in Hive:
If we drop an internal table, both the schema and the data stored in the warehouse folder are deleted permanently.
If we drop an external table, only the schema is deleted; the hdfs file(s) won't be affected.
hive> show tables;
OK
my1 // Going to drop it
mytableloc
student
student_ext
hive> describe formatted my1;
Location: hdfs://quickstart.cloudera:8020/user/hive/warehouse/my1
In hdfs, the file is there:
$ hdfs dfs -ls /user/hive/warehouse/my1
Found 1 items
-rwxrwxrwx 1 cloudera supergroup 110 2018-08-29 03:02 /user/hive/warehouse/my1/000000_0
Going to drop the internal table:
hive> drop table my1;
After dropping the table, the physical file is removed:
hdfs dfs -ls /user/hive/warehouse/my1
ls: `/user/hive/warehouse/my1': No such file or directory
hive> show tables;
OK
mytableloc
student
student_ext // going to delete this one next
// my1 is missing here as we deleted it.
We are going to delete the external table:
hive> describe formatted student_ext;
Location: hdfs://quickstart.cloudera:8020/USECASE/student_ext
Display the folder of hdfs :
$ hdfs dfs -ls /USECASE/student_ext
Found 3 items
-rw-r--r-- 1 cloudera supergroup 103 2018-08-28 22:10 /USECASE/student_ext/student.txt
-rw-r--r-- 1 cloudera supergroup 42 2018-08-29 02:27 /USECASE/student_ext/student1.txt
-rw-r--r-- 1 cloudera supergroup 29 2018-08-29 02:30 /USECASE/student_ext/student2.txt
To drop external table:
hive> drop table student_ext;
hive> show tables;
OK
mytableloc
student
// student_ext table is dropped and it is not in the list after dropping
Although we just dropped the table in hive, the folder and its corresponding files still reside in hdfs:
hdfs dfs -ls /USECASE/student_ext
hive> truncate table student_ext;
cannot truncate non managed table // an external table cannot be truncated
hive> truncate table student;
An internal table can be truncated.
**** We can truncate only an internal (managed) table, not an external table.
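Since truncate is refused for an external table, the usual workaround is to delete its files directly in hdfs (a sketch; take the path from describe formatted):
$ hdfs dfs -rm -r /USECASE/student_ext/*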
Create Table AS
CTAS:
hive> create table student as select * from default.student;
hive> create table myTableloc as select * from default.mytableloc;
Map Join:
Keep the small table in memory and perform the join there to get better performance.
If we want to join the Student and Department tables, the Department table is clearly the small one; it may have 20 rows, while the Student table may have
a large number of rows, say 10 million.
Instead of hitting disk I/O for the Department table again and again, we can tell Hive to keep the Department table in memory;
that saves seek time and disk I/O against the Department table.
hive> select /*+ MAPJOIN(Department) */ * from Student inner join Department on Department.ID = Student.DepartmentID;
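The MAPJOIN hint is the manual route; recent Hive versions can convert eligible joins to map joins automatically when the small table fits under a size threshold. A sketch of the relevant settings (both property names are standard Hive):
hive> set hive.auto.convert.join = true; // convert eligible joins to map joins automatically
hive> set hive.mapjoin.smalltable.filesize = 25000000; // small-table threshold in bytes (default is about 25 MB)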
How Hive concepts map onto storage:
Namespaces -> Databases
Schemas -> Tables
Directories -> Partitions
Files -> Buckets or clusters
Creating a new table partitioned by city:
hive> create table student_part(id string, name string) partitioned by (city string) row format delimited fields terminated by ',' stored as textFile;
describe formatted student_part;
Location: hdfs://quickstart.cloudera:8020/user/hive/warehouse/batch_may.db/student_part // before renaming
Change the name of the table:
hive> alter table student_part rename to person_part;
describe person_part; // the output includes partition info too
OK
id string
name string
city string
# Partition Information
# col_name data_type comment
city string
describe formatted person_part;
Location: hdfs://quickstart.cloudera:8020/user/hive/warehouse/batch_may.db/person_part
[cloudera@quickstart ~]$ hdfs dfs -ls /user/hive/warehouse/batch_may.db/
drwxrwxrwx - cloudera supergroup 0 2018-08-29 18:34 /user/hive/warehouse/batch_may.db/mytableloc
drwxrwxrwx - cloudera supergroup 0 2018-08-30 01:17 /user/hive/warehouse/batch_may.db/person_part // after renaming
drwxrwxrwx - cloudera supergroup 0 2018-08-29 18:32 /user/hive/warehouse/batch_may.db/student
[cloudera@quickstart ~]$ hdfs dfs -ls /user/hive/warehouse/batch_may.db/person_part/
// currently no records found so folder is empty
Set the following in hive (strict mode requires at least one static partition key; nonstrict allows all partition keys to be dynamic):
hive> set hive.exec.dynamic.partition=true;
hive> set hive.exec.dynamic.partition.mode=nonstrict;
step #1:
create a normal table in hive:
hive> create table student(id int, name string, location string);
Insert some rows into the student table of the batch_may db:
hive> insert into student(id,name,location) values (101,'sara','MDR');
hive> insert into student(id,name,location) values (102,'saran','CHN');
hive> insert into student(id,name,location) values (103,'Maran','TRC');
hive> insert into student(id,name,location) values (103,'Maran','TRC');
step #2:
create a partitioned table:
hive> create table student_part(name string,location string) partitioned by (id int) row format delimited fields terminated by ',' stored as textFile;
step #3:
Feed data from student table to student_part table:
hive> insert into table student_part partition(id) select name,location,id from student;
Loading partition {id=102}
Loading partition {id=103}
Loading partition {id=101}
hive> describe formatted student_part;
Location: hdfs://quickstart.cloudera:8020/user/hive/warehouse/batch_may.db/student_part
see the contents of that folder in hdfs:
hdfs dfs -ls /user/hive/warehouse/batch_may.db/student_part/
drwxrwxrwx - cloudera supergroup 0 2018-09-02 01:01 /user/hive/warehouse/batch_may.db/student_part/id=101
drwxrwxrwx - cloudera supergroup 0 2018-09-02 01:01 /user/hive/warehouse/batch_may.db/student_part/id=102
drwxrwxrwx - cloudera supergroup 0 2018-09-02 01:01 /user/hive/warehouse/batch_may.db/student_part/id=103
hdfs dfs -ls /user/hive/warehouse/batch_may.db/student_part/id=101
Found 1 items
-rwxrwxrwx 1 cloudera supergroup 9 2018-09-02 01:01 /user/hive/warehouse/batch_may.db/student_part/id=101/000000_0
See the content of that file (the partition column id is stored in the directory name, not in the file):
hdfs dfs -cat /user/hive/warehouse/batch_may.db/student_part/id=101/000000_0
sara,MDR
static partition:
----------------
create table student_part_stat (id int, name string) partitioned by (location string) row format delimited fields terminated by ',' stored as textFile;
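These notes declare the table but never load it; a minimal sketch of a static-partition load, assuming the same student data as before (the partition value is spelled out by hand instead of being derived from each row):
hive> insert into table student_part_stat partition(location='MDR') select id, name from student where location='MDR';
// Rows land under .../student_part_stat/location=MDR/ and the location column is fixed by the partition spec.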
folder cleaning:
[cloudera@quickstart ~]$ hdfs dfs -rm -r /USECASE
Deleted /USECASE
Create a folder structure in hdfs:
---------------------------
[cloudera@quickstart ~]$ hdfs dfs -mkdir /USECASE
[cloudera@quickstart ~]$ hdfs dfs -mkdir /USECASE/test
[cloudera@quickstart ~]$ hdfs dfs -mkdir /USECASE/test/student
create .txt files in local linux
--------------------------------
$ mkdir sara
$ cd sara
$ ls
$ pwd
/home/cloudera/sara
create multiple files in local linux:
--------------------------------------
[cloudera@quickstart sara]$ cat > a001.txt
001,sara,MDR
002,Lara,WestIndies
003,Arjuna,Srilanka
^C
[cloudera@quickstart sara]$ cat > a002.txt
004,kalai,ottangadu
005,Muthu,Aathalur
006,Neels,Perai
^C
[cloudera@quickstart sara]$ cat >a003.txt
007,Anbu,Melai
008,Naga,Singa
009,Veera,Singa
^C
ls
a001.txt a002.txt a003.txt
Transfer local files into hdfs:
---------------------------------
[cloudera@quickstart sara]$ hdfs dfs -put *.* /USECASE/test/student/
Display the hdfs folder contents:
-----------------------------------
[cloudera@quickstart sara]$ hdfs dfs -ls /USECASE/test/student/
Found 3 items
-rw-r--r-- 1 cloudera supergroup 53 2018-09-02 01:49 /USECASE/test/student/a001.txt
-rw-r--r-- 1 cloudera supergroup 55 2018-09-02 01:49 /USECASE/test/student/a002.txt
-rw-r--r-- 1 cloudera supergroup 46 2018-09-02 01:49 /USECASE/test/student/a003.txt