sorted.saveAsTextFile(output)   // end of the previous example: save the sorted RDD to the output path
----------------------------------------------
Grep Job in Spark:
----------------------------------------------
grep.saveAsTextFile(output)
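Only the final save is shown above. A minimal sketch of the complete grep job, assuming the input/output paths and the search pattern (all three values are assumptions):

// grep job: keep only the lines that contain the pattern, then save
val input = "file:///home/orienit/spark/input/demoinput"
val output = "file:///home/orienit/spark/output/grepoutput"
val pattern = "spark"

val data = sc.textFile(input)
val grep = data.filter(line => line.contains(pattern))
grep.saveAsTextFile(output)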
----------------------------------------------
We can currently run distributed projects on 3 runtimes:
----------------------------------------------
1. MapReduce
2. Tez
3. Spark
----------------------------------------------
----------------------------------------------
SPARK SQL
----------------------------------------------
----------------------------------------------
Examples on DataFrame
----------------------------------------------
In Spark 1.x the entry point is `sqlContext`; the examples below use Spark 2.x's unified entry point, the `spark` session (SparkSession).
----------------------------------------------
val df = spark.read.json(file)   // `file` is a path to a JSON file of student records, one object per line
df.schema        // the schema as a StructType
df.printSchema   // prints the schema as a tree
----------------------------------------------
scala> df.schema
res4: org.apache.spark.sql.types.StructType =
StructType(StructField(course,StringType,true), StructField(id,LongType,true),
StructField(name,StringType,true), StructField(year,LongType,true))
scala> df.printSchema
root
|-- course: string (nullable = true)
|-- id: long (nullable = true)
|-- name: string (nullable = true)
|-- year: long (nullable = true)
scala> df.show()
+------+---+------+----+
|course| id| name|year|
+------+---+------+----+
| spark| 1| anil|2016|
|hadoop| 5|anvith|2015|
|hadoop| 6| dev|2015|
| spark| 3| raj|2016|
|hadoop| 4| sunil|2015|
| spark| 2|venkat|2016|
+------+---+------+----+
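For reference, the input JSON behind this DataFrame is one record per line; reconstructed from the `df.show()` output above, it looks like:

{"course":"spark","id":1,"name":"anil","year":2016}
{"course":"hadoop","id":5,"name":"anvith","year":2015}
{"course":"hadoop","id":6,"name":"dev","year":2015}
{"course":"spark","id":3,"name":"raj","year":2016}
{"course":"hadoop","id":4,"name":"sunil","year":2015}
{"course":"spark","id":2,"name":"venkat","year":2016}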
----------------------------------------------
A DataFrame supports 2 types of functionality:
1. DSL operations (method calls such as `limit`, `groupBy`)
2. SQL queries (against registered temp tables)
----------------------------------------------
Examples on DSL
----------------------------------------------
scala> df.limit(4).show()
+------+---+------+----+
|course| id| name|year|
+------+---+------+----+
| spark| 1| anil|2016|
|hadoop| 5|anvith|2015|
|hadoop| 6| dev|2015|
| spark| 3| raj|2016|
+------+---+------+----+
scala> df.groupBy()
res14: org.apache.spark.sql.RelationalGroupedDataset =
org.apache.spark.sql.RelationalGroupedDataset@37a34617
scala> df.groupBy().   // pressing Tab here lists the available aggregation methods:
agg   avg   count   max   mean   min   pivot   sum
scala> df.groupBy().count()
res15: org.apache.spark.sql.DataFrame = [count: bigint]
scala> df.groupBy().count().show()
+-----+
|count|
+-----+
| 6|
+-----+
scala> df.groupBy("year").count().show()
+----+-----+
|year|count|
+----+-----+
|2016| 3|
|2015| 3|
+----+-----+
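A few more common DSL operations on the same DataFrame (a sketch; output omitted):

import org.apache.spark.sql.functions._

df.select("name", "course").show()                          // project two columns
df.filter(df("year") === 2016).show()                       // rows where year is 2016
df.orderBy("id").show()                                     // sort by id ascending
df.groupBy("course").agg(count("id"), max("year")).show()   // per-course aggregates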
----------------------------------------------
Examples on SQL
----------------------------------------------
scala> df.registerTempTable("student")
warning: there was one deprecation warning; re-run with -deprecation for details

(`registerTempTable` is deprecated in Spark 2.x; `df.createOrReplaceTempView("student")` is the current equivalent.)
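With the temp table registered, any SQL query can be run through `spark.sql`; for example:

spark.sql("select * from student where year = 2016").show()   // returns the three 2016 rows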
----------------------------------------------
case class Contact(cid: Int, name: String, loc: String, pincode: Int)
case class Orders(oid: Int, cid: Int, status: String)   // order id, contact id, status

val orders = spark.read.option("delimiter", "\t").csv("file:///home/orienit/spark/input/orders.tsv")
val odf = orders.map(x => Orders(x(0).toString.toInt, x(1).toString.toInt, x(2).toString)).toDF()
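The transcript below uses `cdf` without showing how it was built; presumably it is loaded the same way from a contact TSV file. A minimal sketch, assuming a `contact.tsv` file with the same four columns (the path is an assumption):

val contacts = spark.read.option("delimiter", "\t").csv("file:///home/orienit/spark/input/contact.tsv")
val cdf = contacts.map(x => Contact(x(0).toString.toInt, x(1).toString, x(2).toString, x(3).toString.toInt)).toDF()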
----------------------------------------------
----------------------------------------------
scala> case class Contact(cid: Int, name: String, loc: String, pincode:Int)
defined class Contact
scala> cdf.show()
+---+------+----+-------+
|cid| name| loc|pincode|
+---+------+----+-------+
| 1|kalyan| hyd| 500072|
| 2|venkat| hyd| 500073|
| 3| anil| hyd| 500071|
| 4| raj| hyd| 500075|
| 5| arun| hyd| 500074|
| 6| vani|bang| 600072|
| 7| vamsi|bang| 600073|
| 8|prasad|bang| 600076|
| 9|anvith|bang| 600075|
| 10| swamy|bang| 600071|
+---+------+----+-------+
scala> odf.show()
+---+---+-------+
|oid|cid| status|
+---+---+-------+
|111| 1|success|
|112| 1|failure|
|113| 2|success|
|114| 3|success|
|115| 2|failure|
|116| 3|failure|
|117| 2|success|
|118| 5|success|
|119| 6|failure|
|120| 2|success|
|121| 3|failure|
|122| 7|success|
|123| 3|failure|
|124| 2|success|
|125| 1|failure|
|126| 5|success|
+---+---+-------+
scala> cdf.registerTempTable("contact")
warning: there was one deprecation warning; re-run with -deprecation for details
scala> odf.registerTempTable("orders")
warning: there was one deprecation warning; re-run with -deprecation for details
----------------------------------------------
----------------------------------------------
scala> val query = "select student.* , contact.* from student join contact on student.name == contact.name"
query: String = select student.* , contact.* from student join contact on student.name == contact.name
scala> spark.sql(query).show()
+------+---+------+----+---+------+----+-------+
|course| id| name|year|cid| name| loc|pincode|
+------+---+------+----+---+------+----+-------+
| spark| 1| anil|2016| 3| anil| hyd| 500071|
|hadoop| 5|anvith|2015| 9|anvith|bang| 600075|
| spark| 3| raj|2016| 4| raj| hyd| 500075|
| spark| 2|venkat|2016| 2|venkat| hyd| 500073|
+------+---+------+----+---+------+----+-------+
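The same join can also be written in the DSL instead of SQL; a sketch using the standard DataFrame `join` API:

df.join(cdf, df("name") === cdf("name")).show()   // inner join on the name column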
----------------------------------------------
scala> hivedf.write.json("file:///home/orienit/work/output/hive-json")
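`hivedf` is not defined in this transcript; presumably it holds the contents of a Hive table read through a Hive-enabled SparkSession. A minimal sketch (the table name is an assumption):

val hivedf = spark.table("kalyan.student")   // read a Hive table as a DataFrame (assumed table name)

after which the `write.json` call above exports it as JSON files.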
----------------------------------------------
rdbmsdf.show()
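`rdbmsdf` is likewise not defined here; presumably it comes from a JDBC read. A minimal sketch, assuming a local MySQL database (URL, table, and credentials are all assumptions):

val rdbmsdf = spark.read.format("jdbc").
  option("url", "jdbc:mysql://localhost:3306/kalyan").   // assumed connection URL
  option("dbtable", "contact").                          // assumed source table
  option("user", "root").
  option("password", "hadoop").                          // assumed credentials
  load()

The `rdbmsdf.show()` call above then displays the loaded table.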
----------------------------------------------
cdf.write.saveAsTable("kalyan.contact")
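`saveAsTable` persists `cdf` as a managed table in the `kalyan` database; it can then be read back from Spark (or queried from Hive if the session is Hive-enabled). A sketch:

spark.table("kalyan.contact").show()   // read the saved table back as a DataFrame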
----------------------------------------------