
----------------------------------------------

Word Count in Spark:


----------------------------------------------

val input = "file:///home/orienit/work/input/demoinput"

val output = "file:///home/orienit/work/output/wordcount"

// read the input file into an RDD of lines
val file = sc.textFile(input, 1)

// split each line into words
val words = file.flatMap(line => line.split(" "))

// pair each word with a count of 1
val tuples = words.map(word => (word, 1))

// sum the counts per word
val wordcount = tuples.reduceByKey((a, b) => a + b)

// sort the result alphabetically by word
val sorted = wordcount.sortByKey()

sorted.saveAsTextFile(output)
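
To inspect the result quickly in the shell instead of reading the output files, the RDD can also be collected and printed (fine here only because the set of distinct words is small):

sorted.collect().foreach(println)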

----------------------------------------------
Grep Job in Spark:
----------------------------------------------

val input = "file:///home/orienit/work/input/demoinput"

val output = "file:///home/orienit/work/output/grep"

val file = sc.textFile(input, 1)

// keep only the lines containing the search term "am"
val grep = file.filter(line => line.contains("am"))

grep.saveAsTextFile(output)
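
To simply count the matching lines, without writing an output directory:

grep.count()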

----------------------------------------------
We can currently run distributed data-processing jobs on 3 runtimes:
----------------------------------------------

1. MapReduce

2. Tez

3. Spark
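
For example, in Hive the runtime can be switched per session with `set hive.execution.engine=mr;` (or `=tez;` / `=spark;`), assuming the cluster ships Tez and Spark support for Hive.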

----------------------------------------------

----------------------------------------------
SPARK SQL
----------------------------------------------

Spark SQL provides `DataFrame`.

A `DataFrame` is equivalent to a `Table` in an RDBMS.
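
A quick way to see this in the shell (a minimal sketch; the rows and column names are made up for illustration, and spark-shell already imports `spark.implicits._`):

val demo = Seq((1, "anil"), (2, "venkat")).toDF("id", "name")

demo.show()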

----------------------------------------------
Examples on DataFrame
----------------------------------------------
In spark-1.x we have `sqlContext`

In spark-2.x we have both `sqlContext` and `SparkSession` (pre-created in the shell as `spark`)
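
In a standalone spark-2.x application (outside the shell, where `spark` is pre-created) the session is built explicitly; a minimal sketch with an assumed app name:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("demo")
  .master("local[*]")
  .getOrCreate()

// the spark-1.x style entry point is still available for compatibility
val sqlContext = spark.sqlContext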

----------------------------------------------

val file = "file:///home/orienit/spark/input/student.json"

// load the JSON file as a DataFrame, inferring the schema
val df = spark.read.json(file)

df.schema

df.printSchema

----------------------------------------------

scala> val file = "file:///home/orienit/spark/input/student.json"
file: String = file:///home/orienit/spark/input/student.json

scala> val df = spark.read.json(file)
df: org.apache.spark.sql.DataFrame = [course: string, id: bigint ... 2 more fields]

scala> df.schema
res4: org.apache.spark.sql.types.StructType = StructType(StructField(course,StringType,true), StructField(id,LongType,true), StructField(name,StringType,true), StructField(year,LongType,true))

scala> df.printSchema
root
|-- course: string (nullable = true)
|-- id: long (nullable = true)
|-- name: string (nullable = true)
|-- year: long (nullable = true)

scala> df.show()
+------+---+------+----+
|course| id| name|year|
+------+---+------+----+
| spark| 1| anil|2016|
|hadoop| 5|anvith|2015|
|hadoop| 6| dev|2015|
| spark| 3| raj|2016|
|hadoop| 4| sunil|2015|
| spark| 2|venkat|2016|
+------+---+------+----+

----------------------------------------------
DataFrame supports 2 types of functionality:

1. DSL (Domain Specific Language)

2. SQL (Structured Query Language)

----------------------------------------------
Examples on DSL
----------------------------------------------

scala> df.select("name", "id").show()


+------+---+
| name| id|
+------+---+
| anil| 1|
|anvith| 5|
| dev| 6|
| raj| 3|
| sunil| 4|
|venkat| 2|
+------+---+

scala> df.select("name", "id", "year").show()


+------+---+----+
| name| id|year|
+------+---+----+
| anil| 1|2016|
|anvith| 5|2015|
| dev| 6|2015|
| raj| 3|2016|
| sunil| 4|2015|
|venkat| 2|2016|
+------+---+----+

scala> df.filter("id > 3").show()


+------+---+------+----+
|course| id| name|year|
+------+---+------+----+
|hadoop| 5|anvith|2015|
|hadoop| 6| dev|2015|
|hadoop| 4| sunil|2015|
+------+---+------+----+

scala> df.where("id > 3").show()


+------+---+------+----+
|course| id| name|year|
+------+---+------+----+
|hadoop| 5|anvith|2015|
|hadoop| 6| dev|2015|
|hadoop| 4| sunil|2015|
+------+---+------+----+

scala> df.limit(4).show()
+------+---+------+----+
|course| id| name|year|
+------+---+------+----+
| spark| 1| anil|2016|
|hadoop| 5|anvith|2015|
|hadoop| 6| dev|2015|
| spark| 3| raj|2016|
+------+---+------+----+
scala> df.groupBy()
res14: org.apache.spark.sql.RelationalGroupedDataset = org.apache.spark.sql.RelationalGroupedDataset@37a34617

scala> df.groupBy().   (pressing Tab here lists the available aggregate functions)
agg   avg   count   max   mean   min   pivot   sum

scala> df.groupBy().count()
res15: org.apache.spark.sql.DataFrame = [count: bigint]

scala> df.groupBy().count().show()
+-----+
|count|
+-----+
| 6|
+-----+

scala> df.groupBy("year").count().show()
+----+-----+
|year|count|
+----+-----+
|2016| 3|
|2015| 3|
+----+-----+

scala> df.groupBy("year", "course").count().show()


+----+------+-----+
|year|course|count|
+----+------+-----+
|2016| spark| 3|
|2015|hadoop| 3|
+----+------+-----+
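
Beyond count(), groupBy accepts the aggregate functions listed above through agg(); a sketch (output omitted):

import org.apache.spark.sql.functions._

df.groupBy("year").agg(max("id"), avg("id")).show()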

----------------------------------------------
Examples on SQL
----------------------------------------------

scala> df.registerTempTable("student")
warning: there was one deprecation warning; re-run with -deprecation for details
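
The warning appears because spark-2.x renamed this API; the non-deprecated equivalent is:

df.createOrReplaceTempView("student")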

scala> spark.sql("select * from student").show()


+------+---+------+----+
|course| id| name|year|
+------+---+------+----+
| spark| 1| anil|2016|
|hadoop| 5|anvith|2015|
|hadoop| 6| dev|2015|
| spark| 3| raj|2016|
|hadoop| 4| sunil|2015|
| spark| 2|venkat|2016|
+------+---+------+----+

scala> spark.sql("select name, id from student").show()


+------+---+
| name| id|
+------+---+
| anil| 1|
|anvith| 5|
| dev| 6|
| raj| 3|
| sunil| 4|
|venkat| 2|
+------+---+

scala> spark.sql("select name, id, year from student").show()


+------+---+----+
| name| id|year|
+------+---+----+
| anil| 1|2016|
|anvith| 5|2015|
| dev| 6|2015|
| raj| 3|2016|
| sunil| 4|2015|
|venkat| 2|2016|
+------+---+----+

scala> spark.sql("select * from student where id > 3").show()


+------+---+------+----+
|course| id| name|year|
+------+---+------+----+
|hadoop| 5|anvith|2015|
|hadoop| 6| dev|2015|
|hadoop| 4| sunil|2015|
+------+---+------+----+

scala> spark.sql("select count(*) from student").show()


+--------+
|count(1)|
+--------+
| 6|
+--------+
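
Grouping works the same way through SQL as through the DSL; a sketch (output omitted):

spark.sql("select year, count(*) as cnt from student group by year").show()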

----------------------------------------------
case class Contact(cid: Int, name: String, loc: String, pincode:Int)

case class Orders(oid: Int, cid: Int, status: String)

val contact = spark.read.csv("file:///home/orienit/spark/input/contact.csv")

// convert each Row of strings into a typed case class, then back to a DataFrame with named columns
val cdf = contact.map(c => Contact(c(0).toString.toInt, c(1).toString, c(2).toString, c(3).toString.toInt)).toDF()

// the orders file is tab-delimited, so the delimiter option is set explicitly
val orders = spark.read.option("delimiter","\t").csv("file:///home/orienit/spark/input/orders.tsv")

val odf = orders.map(x => Orders(x(0).toString.toInt, x(1).toString.toInt, x(2).toString)).toDF()

----------------------------------------------

----------------------------------------------
scala> case class Contact(cid: Int, name: String, loc: String, pincode:Int)
defined class Contact

scala> case class Orders(oid: Int, cid: Int, status: String)
defined class Orders

scala> val contact = spark.read.csv("file:///home/orienit/spark/input/contact.csv")
contact: org.apache.spark.sql.DataFrame = [_c0: string, _c1: string ... 2 more fields]

scala> val cdf = contact.map(c => Contact(c(0).toString.toInt, c(1).toString, c(2).toString, c(3).toString.toInt)).toDF()
cdf: org.apache.spark.sql.DataFrame = [cid: int, name: string ... 2 more fields]

scala> val orders = spark.read.option("delimiter","\t").csv("file:///home/orienit/spark/input/orders.tsv")
orders: org.apache.spark.sql.DataFrame = [_c0: string, _c1: string ... 1 more field]

scala> val odf = orders.map(x => Orders(x(0).toString.toInt, x(1).toString.toInt, x(2).toString)).toDF()
odf: org.apache.spark.sql.DataFrame = [oid: int, cid: int ... 1 more field]

scala> cdf.show()
+---+------+----+-------+
|cid| name| loc|pincode|
+---+------+----+-------+
| 1|kalyan| hyd| 500072|
| 2|venkat| hyd| 500073|
| 3| anil| hyd| 500071|
| 4| raj| hyd| 500075|
| 5| arun| hyd| 500074|
| 6| vani|bang| 600072|
| 7| vamsi|bang| 600073|
| 8|prasad|bang| 600076|
| 9|anvith|bang| 600075|
| 10| swamy|bang| 600071|
+---+------+----+-------+

scala> odf.show()
+---+---+-------+
|oid|cid| status|
+---+---+-------+
|111| 1|success|
|112| 1|failure|
|113| 2|success|
|114| 3|success|
|115| 2|failure|
|116| 3|failure|
|117| 2|success|
|118| 5|success|
|119| 6|failure|
|120| 2|success|
|121| 3|failure|
|122| 7|success|
|123| 3|failure|
|124| 2|success|
|125| 1|failure|
|126| 5|success|
+---+---+-------+

scala> cdf.registerTempTable("contact")
warning: there was one deprecation warning; re-run with -deprecation for details

scala> odf.registerTempTable("orders")
warning: there was one deprecation warning; re-run with -deprecation for details

----------------------------------------------

scala> spark.sql("select contact.*, orders.* from contact join orders on


contact.cid = orders.cid").show()
+---+------+----+-------+---+---+-------+
|cid| name| loc|pincode|oid|cid| status|
+---+------+----+-------+---+---+-------+
| 1|kalyan| hyd| 500072|125| 1|failure|
| 1|kalyan| hyd| 500072|112| 1|failure|
| 1|kalyan| hyd| 500072|111| 1|success|
| 2|venkat| hyd| 500073|124| 2|success|
| 2|venkat| hyd| 500073|120| 2|success|
| 2|venkat| hyd| 500073|117| 2|success|
| 2|venkat| hyd| 500073|115| 2|failure|
| 2|venkat| hyd| 500073|113| 2|success|
| 3| anil| hyd| 500071|123| 3|failure|
| 3| anil| hyd| 500071|121| 3|failure|
| 3| anil| hyd| 500071|116| 3|failure|
| 3| anil| hyd| 500071|114| 3|success|
| 5| arun| hyd| 500074|126| 5|success|
| 5| arun| hyd| 500074|118| 5|success|
| 6| vani|bang| 600072|119| 6|failure|
| 7| vamsi|bang| 600073|122| 7|success|
+---+------+----+-------+---+---+-------+

scala> spark.sql("select * from student").show()


+------+---+------+----+
|course| id| name|year|
+------+---+------+----+
| spark| 1| anil|2016|
|hadoop| 5|anvith|2015|
|hadoop| 6| dev|2015|
| spark| 3| raj|2016|
|hadoop| 4| sunil|2015|
| spark| 2|venkat|2016|
+------+---+------+----+

scala> spark.sql("select * from contact").show()


+---+------+----+-------+
|cid| name| loc|pincode|
+---+------+----+-------+
| 1|kalyan| hyd| 500072|
| 2|venkat| hyd| 500073|
| 3| anil| hyd| 500071|
| 4| raj| hyd| 500075|
| 5| arun| hyd| 500074|
| 6| vani|bang| 600072|
| 7| vamsi|bang| 600073|
| 8|prasad|bang| 600076|
| 9|anvith|bang| 600075|
| 10| swamy|bang| 600071|
+---+------+----+-------+

----------------------------------------------

scala> val query = "select student.* , contact.* from student join contact on student.name == contact.name"
query: String = select student.* , contact.* from student join contact on student.name == contact.name

scala> spark.sql(query).show()
+------+---+------+----+---+------+----+-------+
|course| id| name|year|cid| name| loc|pincode|
+------+---+------+----+---+------+----+-------+
| spark| 1| anil|2016| 3| anil| hyd| 500071|
|hadoop| 5|anvith|2015| 9|anvith|bang| 600075|
| spark| 3| raj|2016| 4| raj| hyd| 500075|
| spark| 2|venkat|2016| 2|venkat| hyd| 500073|
+------+---+------+----+---+------+----+-------+

----------------------------------------------

scala> val hivedf = spark.sql("select * from kalyan.student")
hivedf: org.apache.spark.sql.DataFrame = [name: string, id: int ... 2 more fields]

scala> hivedf.write.json("file:///home/orienit/work/output/hive-json")
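
Reading a Hive table like `kalyan.student` only works when the session has Hive support; spark-shell usually enables it by default, while a standalone application turns it on at build time. A minimal sketch:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("hive-demo")
  .enableHiveSupport()   // requires hive-site.xml on the classpath
  .getOrCreate()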

----------------------------------------------

// JDBC connection properties for the local MySQL instance
val prop = new java.util.Properties
prop.setProperty("driver","com.mysql.jdbc.Driver")
prop.setProperty("user","root")
prop.setProperty("password","hadoop")

// write the Hive query result into the `student` table of the `kalyan` database
hivedf.write.jdbc("jdbc:mysql://localhost:3306/kalyan", "student", prop)

// read the same table back from MySQL into a DataFrame
val rdbmsdf = spark.read.jdbc("jdbc:mysql://localhost:3306/kalyan", "student", prop)

rdbmsdf.show()
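
The same read can also be expressed in the option(...) style instead of a Properties object (same assumed connection details as above):

val rdbmsdf2 = spark.read.format("jdbc").
  option("url", "jdbc:mysql://localhost:3306/kalyan").
  option("driver", "com.mysql.jdbc.Driver").
  option("user", "root").
  option("password", "hadoop").
  option("dbtable", "student").
  load()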

----------------------------------------------

case class Contact(cid: Int, name: String, loc: String, pincode:Int)

val contact = spark.read.csv("file:///home/orienit/spark/input/contact.csv")

val cdf = contact.map(c => Contact(c(0).toString.toInt, c(1).toString, c(2).toString, c(3).toString.toInt)).toDF()

----------------------------------------------

scala> case class Contact(cid: Int, name: String, loc: String, pincode:Int)
defined class Contact

scala> val contact = spark.read.csv("file:///home/orienit/spark/input/contact.csv")
contact: org.apache.spark.sql.DataFrame = [_c0: string, _c1: string ... 2 more fields]

scala> val cdf = contact.map(c => Contact(c(0).toString.toInt, c(1).toString, c(2).toString, c(3).toString.toInt)).toDF()
cdf: org.apache.spark.sql.DataFrame = [cid: int, name: string ... 2 more fields]

scala> cdf.show()
+---+------+----+-------+
|cid| name| loc|pincode|
+---+------+----+-------+
| 1|kalyan| hyd| 500072|
| 2|venkat| hyd| 500073|
| 3| anil| hyd| 500071|
| 4| raj| hyd| 500075|
| 5| arun| hyd| 500074|
| 6| vani|bang| 600072|
| 7| vamsi|bang| 600073|
| 8|prasad|bang| 600076|
| 9|anvith|bang| 600075|
| 10| swamy|bang| 600071|
+---+------+----+-------+

----------------------------------------------

cdf.write.jdbc("jdbc:mysql://localhost:3306/kalyan", "contact", prop)

cdf.write.saveAsTable("kalyan.contact")
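
Both writes fail if the target table already exists (the default SaveMode is ErrorIfExists); to make them re-runnable, set the mode explicitly, for example:

cdf.write.mode("overwrite").jdbc("jdbc:mysql://localhost:3306/kalyan", "contact", prop)

cdf.write.mode("append").saveAsTable("kalyan.contact")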
----------------------------------------------
