[nsha20@ip-172-16-99-122 ~]$ aws s3 cp s3://nike-emr-bin/prod/bot-identification-wg/emr_scripts/spark/frequency_calc_rt.py -
# -*- encoding: utf-8 -*-
####################################################################################
#Author: Mu Sigma
#Date: 2018-05-18
#The code reads in registrant data and finds the frequencies associated with the
#attributes first_name, last_name, date_of_birth, password, login_concat and
#login_length. For cases where a frequency is at least 200, a flag is generated for
#those records.

#The next step is to use the result in the similarity calculation.

#runtime for 400K records is 53 seconds for 200K records.
####################################################################################
####################################################################################
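#As a quick illustration of the rule above (numbers are made up): if 250 registrants in
#the same locale share the first name "john", then freq_first_name = 250 for each of
#those records, 250 >= 200, and fn_flag is set to 1; a first name seen only 40 times in
#that locale gets fn_flag = 0.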
#Spark Session Settings
#
#pyspark --driver-cores 28 --driver-memory 180g \
#    --conf spark.driver.maxResultSize=30g --executor-cores 5 --num-executors 512 --executor-memory 5120m \
#    --conf spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version=2 \
#    --conf fs.s3n.multipart.uploads.enabled=true \
#    --conf spark.yarn.executor.memoryOverhead=1536m \
#    --conf spark.yarn.executor.extraClassPath=./ \
#    --conf spark.scheduler.listenerbus.eventqueue.size=3500000

####################################################################################
####################################################################################
print("packages loaded")
#Importing the required libraries
import sys
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql.functions import stddev
from pyspark.sql.functions import lit
import pyspark.sql.functions as sqlfun
import datetime
####################################################################################

print("functions defined")


# Helper function to add a frequency column: counts the rows that share the same values of listofcolumns
def addnewcolumn(ipdf,newcolname,listofcolumns):
userSubset = ipdf
w = Window.partitionBy(listofcolumns)
userSubset = userSubset.withColumn(newcolname,sqlfun.count("id").over(w))
return userSubset
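
#Illustrative usage (not part of the production flow; toy data):
#   df = spark.createDataFrame(
#       [("1", "john", "US"), ("2", "john", "US"), ("3", "mary", "US")],
#       ["id", "first_name", "locale"])
#   addnewcolumn(df, "freq_first_name", ["first_name", "locale"]).show()
#   #both "john" rows get freq_first_name = 2, the "mary" row gets 1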

# Helper function to add a dense-rank column over the given partition and order columns


def addrankcolumn(ipdf,newcolname,partColumns,orderColumns):
userSubset = ipdf
w = Window.partitionBy(partColumns).orderBy(orderColumns)
userSubset = userSubset.withColumn(newcolname,sqlfun.dense_rank().over(w))
return userSubset
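
#Illustrative usage: addrankcolumn(df, "rank_first_name", ["locale"], ["first_name"])
#dense-ranks the distinct first_name values within each locale, so all rows with the
#first name that sorts first in a locale get rank 1, the next distinct first name gets
#rank 2, and so on. The rank is later used as a subscript to build unique pattern names.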
def main(S3_bucket,IDENTITY_DB):

####################################################################################

####################################################################################
print("data loaded")
# Reading the data
user = spark.sql("select id, login, password, first_name, last_name, locale, date_of_birth as dob, login_length, login_concat from " + IDENTITY_DB + ".pyspark_datapull_rt where locale IS NOT NULL AND Date_Of_Birth IS NOT NULL AND first_name IS NOT NULL")
userSubset = user.toDF("id", "login", "password", "first_name", "last_name", "locale", "dob", "login_length", "login_concat").repartition(94)

####################################################################################

####################################################################################

#keeping locales with freq >=200


userSubset=addnewcolumn(userSubset, "locale_frequency",["locale"])

userSubset=userSubset.where(userSubset.locale_frequency>=200).repartition(48)
print "checkpoint 1"

####################################################################################

####################################################################################
#add frequency of first name as first attribute
userSubset_fn=addnewcolumn(userSubset, "freq_first_name", ["first_name","locale"])

#keeping only records where the first attribute frequency is >= 200

userSubset_fn=userSubset_fn.where(userSubset_fn.freq_first_name>=200).repartition(48)

#finding frequency of other attributes for a given first name


userSubset_fn=addnewcolumn(userSubset_fn, "freq_last_name",
["first_name","locale","last_name"])
userSubset_fn=addnewcolumn(userSubset_fn, "freq_dob",
["first_name","locale","dob"])
userSubset_fn=addnewcolumn(userSubset_fn, "freq_password",
["first_name","locale","password"])
userSubset_fn=addnewcolumn(userSubset_fn, "freq_login_length",
["first_name","locale","login_length"])
userSubset_fn=addnewcolumn(userSubset_fn, "freq_login_concat",
["first_name","locale","login_concat"])

#adding the flags for >= 200


userSubset_fn = userSubset_fn.withColumn("fn_flag",
sqlfun.when( (sqlfun.col("freq_first_name")>=200) , 1).otherwise(0))
userSubset_fn = userSubset_fn.withColumn("ln_flag",
sqlfun.when( (sqlfun.col("freq_last_name")>=200) , 1).otherwise(0))
userSubset_fn = userSubset_fn.withColumn("pswd_flag",
sqlfun.when( (sqlfun.col("freq_password")>=200) , 1).otherwise(0))
userSubset_fn = userSubset_fn.withColumn("dob_flag",
sqlfun.when( (sqlfun.col("freq_dob")>=200) , 1).otherwise(0))
userSubset_fn = userSubset_fn.withColumn('loglen_flag',
sqlfun.when( (sqlfun.col('freq_login_length')>=200) , 1).otherwise(0))
userSubset_fn = userSubset_fn.withColumn('concat_flag',
sqlfun.when( (sqlfun.col('freq_login_concat')>=200) , 1).otherwise(0))

#adding rank by first attribute for each locale. This will be used as a subscript to give each pattern a unique name
userSubset_fn=addrankcolumn(userSubset_fn, "rank_first_name",["locale"],
["first_name"])

#pattern_name assignment
userSubset_fn = userSubset_fn.withColumn("first_attribute",
sqlfun.lit("firstname"))
userSubset_fn = userSubset_fn.withColumn("pattern_name",
sqlfun.concat(userSubset_fn.first_attribute, userSubset_fn.locale,
userSubset_fn.rank_first_name)).repartition(48)
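
#For example, with the values above a row whose locale string is "en_US" and whose
#first_name ranks third in that locale would get pattern_name = "firstnameen_US3"
#(the exact locale format depends on the source data; this is only an illustration).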

#drop columns that will not be needed later

#userSubset_fn=userSubset_fn.drop("locale_frequency","rank_first_name").repartition(48)

print("checkpoint fn")

####################################################################################

####################################################################################

#add frequency of last name as first attribute


userSubset_ln=addnewcolumn(userSubset, "freq_last_name",
["last_name","locale"])

#keeping only records where the first attribute frequency is >= 200

userSubset_ln=userSubset_ln.where(userSubset_ln.freq_last_name>=200).repartition(48)

#finding frequency of other attributes for a given last name


userSubset_ln=addnewcolumn(userSubset_ln, "freq_first_name",
["last_name","locale","first_name"])
userSubset_ln=addnewcolumn(userSubset_ln, "freq_dob",
["last_name","locale","dob"])
userSubset_ln=addnewcolumn(userSubset_ln, "freq_password",
["last_name","locale","password"])
userSubset_ln=addnewcolumn(userSubset_ln, "freq_login_length",
["last_name","locale","login_length"])
userSubset_ln=addnewcolumn(userSubset_ln, "freq_login_concat",
["last_name","locale","login_concat"])

#adding the flags


userSubset_ln = userSubset_ln.withColumn("fn_flag",
sqlfun.when( (sqlfun.col("freq_first_name")>=200) , 1).otherwise(0))
userSubset_ln = userSubset_ln.withColumn("ln_flag",
sqlfun.when( (sqlfun.col("freq_last_name")>=200) , 1).otherwise(0))
userSubset_ln = userSubset_ln.withColumn("pswd_flag",
sqlfun.when( (sqlfun.col("freq_password")>=200) , 1).otherwise(0))
userSubset_ln = userSubset_ln.withColumn("dob_flag",
sqlfun.when( (sqlfun.col("freq_dob")>=200) , 1).otherwise(0))
userSubset_ln = userSubset_ln.withColumn('loglen_flag',
sqlfun.when( (sqlfun.col('freq_login_length')>=200) , 1).otherwise(0))
userSubset_ln = userSubset_ln.withColumn('concat_flag',
sqlfun.when( (sqlfun.col('freq_login_concat')>=200) , 1).otherwise(0))

#adding rank by first attribute for each locale. This will be used as a subscript to give each pattern a unique name
userSubset_ln=addrankcolumn(userSubset_ln, "rank_last_name",["locale"],
["last_name"])

#pattern_name assignment
userSubset_ln = userSubset_ln.withColumn("first_attribute",
sqlfun.lit("lastname"))
userSubset_ln = userSubset_ln.withColumn("pattern_name",
sqlfun.concat(userSubset_ln.first_attribute, userSubset_ln.locale,
userSubset_ln.rank_last_name)).repartition(48)

#drop columns that will not be needed later

#userSubset_ln=userSubset_ln.drop("locale_frequency","rank_last_name").repartition(48)

print("checkpoint ln")

####################################################################################

####################################################################################

#add frequency of dob as first attribute


userSubset_dob=addnewcolumn(userSubset, "freq_dob",["dob","locale"])

#keeping only records where the first attribute frequency is >= 200

userSubset_dob=userSubset_dob.where(userSubset_dob.freq_dob>=200).repartition(48)

#finding frequency of other attributes for a given dob


userSubset_dob=addnewcolumn(userSubset_dob, "freq_first_name",
["dob","locale","first_name"])
userSubset_dob=addnewcolumn(userSubset_dob, "freq_last_name",
["dob","locale","last_name"])
userSubset_dob=addnewcolumn(userSubset_dob, "freq_password",
["dob","locale","password"])
userSubset_dob=addnewcolumn(userSubset_dob, "freq_login_length",
["dob","locale","login_length"])
userSubset_dob=addnewcolumn(userSubset_dob, "freq_login_concat",
["dob","locale","login_concat"])

#adding the flags


userSubset_dob = userSubset_dob.withColumn("fn_flag",
sqlfun.when( (sqlfun.col("freq_first_name")>=200) , 1).otherwise(0))
userSubset_dob = userSubset_dob.withColumn("ln_flag",
sqlfun.when( (sqlfun.col("freq_last_name")>=200) , 1).otherwise(0))
userSubset_dob = userSubset_dob.withColumn("pswd_flag",
sqlfun.when( (sqlfun.col("freq_password")>=200) , 1).otherwise(0))
userSubset_dob = userSubset_dob.withColumn("dob_flag",
sqlfun.when( (sqlfun.col("freq_dob")>=200) , 1).otherwise(0))
userSubset_dob = userSubset_dob.withColumn('loglen_flag',
sqlfun.when( (sqlfun.col('freq_login_length')>=200) , 1).otherwise(0))
userSubset_dob = userSubset_dob.withColumn('concat_flag',
sqlfun.when( (sqlfun.col('freq_login_concat')>=200) , 1).otherwise(0))

#adding rank by first attribute for each locale. This will be used as a subscript to give each pattern a unique name
userSubset_dob=addrankcolumn(userSubset_dob, "rank_dob",["locale"],["dob"])

#pattern_name assignment
userSubset_dob = userSubset_dob.withColumn("first_attribute",
sqlfun.lit("dob"))
userSubset_dob = userSubset_dob.withColumn("pattern_name",
sqlfun.concat(userSubset_dob.first_attribute, userSubset_dob.locale,
userSubset_dob.rank_dob)).repartition(48)

#drop columns that will not be needed later

#userSubset_dob=userSubset_dob.drop("locale_frequency","rank_dob").repartition(48)

print "checkpoint dob"

####################################################################################

####################################################################################

#add frequency of password as first attribute


userSubset_pswd=addnewcolumn(userSubset, "freq_password",
["password","locale"])

#keeping only records where the first attribute frequency is >= 200


#overriding freq_password with 0 so that no record passes the >= 200 filter below;
#this effectively disables password as a first attribute while keeping the branch in place
userSubset_pswd=userSubset_pswd.withColumn("freq_password", lit(0))

userSubset_pswd=userSubset_pswd.where(userSubset_pswd.freq_password>=200).repartition(48)

#finding frequency of other attributes for a given password


userSubset_pswd=addnewcolumn(userSubset_pswd, "freq_first_name",
["password","locale","first_name"])
userSubset_pswd=addnewcolumn(userSubset_pswd, "freq_last_name",
["password","locale","last_name"])
userSubset_pswd=addnewcolumn(userSubset_pswd, "freq_dob",
["password","locale","dob"])
userSubset_pswd=addnewcolumn(userSubset_pswd, "freq_login_length",
["password","locale","login_length"])
userSubset_pswd=addnewcolumn(userSubset_pswd, "freq_login_concat",
["password","locale","login_concat"])

#adding the flags


userSubset_pswd = userSubset_pswd.withColumn("fn_flag",
sqlfun.when( (sqlfun.col("freq_first_name")>=200) , 1).otherwise(0))
userSubset_pswd = userSubset_pswd.withColumn("ln_flag",
sqlfun.when( (sqlfun.col("freq_last_name")>=200) , 1).otherwise(0))
userSubset_pswd = userSubset_pswd.withColumn("pswd_flag",
sqlfun.when( (sqlfun.col("freq_password")>=200) , 1).otherwise(0))
userSubset_pswd = userSubset_pswd.withColumn("dob_flag",
sqlfun.when( (sqlfun.col("freq_dob")>=200) , 1).otherwise(0))
userSubset_pswd = userSubset_pswd.withColumn('loglen_flag',
sqlfun.when( (sqlfun.col('freq_login_length')>=200) , 1).otherwise(0))
userSubset_pswd = userSubset_pswd.withColumn('concat_flag',
sqlfun.when( (sqlfun.col('freq_login_concat')>=200) , 1).otherwise(0))

#adding rank by first attribute for each locale. This will be used as a subscript to give each pattern a unique name
userSubset_pswd=addrankcolumn(userSubset_pswd, "rank_password",["locale"],
["password"])

#pattern_name assignment
userSubset_pswd = userSubset_pswd.withColumn("first_attribute",
sqlfun.lit("password"))
userSubset_pswd = userSubset_pswd.withColumn("pattern_name",
sqlfun.concat(userSubset_pswd.first_attribute, userSubset_pswd.locale,
userSubset_pswd.rank_password)).repartition(48)

#drop columns that will not be needed later


#userSubset_pswd=userSubset_pswd.drop("locale_frequency", "rank_password").repartition(48)

print("checkpoint password")

####################################################################################

#reordering columns
userSubset_fn=userSubset_fn.select("id", "first_name", "last_name",
"password", "dob", "login", "login_length", "login_concat", "locale",
"freq_first_name", "freq_last_name", "freq_password", "freq_dob",
"freq_login_length", "freq_login_concat", "fn_flag", "ln_flag", "pswd_flag",
"dob_flag", "loglen_flag", "concat_flag", "first_attribute", "pattern_name")
userSubset_ln=userSubset_ln.select("id", "first_name", "last_name",
"password", "dob", "login", "login_length", "login_concat", "locale",
"freq_first_name", "freq_last_name", "freq_password", "freq_dob",
"freq_login_length", "freq_login_concat", "fn_flag", "ln_flag", "pswd_flag",
"dob_flag", "loglen_flag", "concat_flag", "first_attribute", "pattern_name")
userSubset_pswd=userSubset_pswd.select("id", "first_name", "last_name",
"password", "dob", "login", "login_length", "login_concat", "locale",
"freq_first_name", "freq_last_name", "freq_password", "freq_dob",
"freq_login_length", "freq_login_concat", "fn_flag", "ln_flag", "pswd_flag",
"dob_flag", "loglen_flag", "concat_flag", "first_attribute", "pattern_name")
userSubset_dob=userSubset_dob.select("id", "first_name", "last_name",
"password", "dob", "login", "login_length", "login_concat", "locale",
"freq_first_name", "freq_last_name", "freq_password", "freq_dob",
"freq_login_length", "freq_login_concat", "fn_flag", "ln_flag", "pswd_flag",
"dob_flag", "loglen_flag", "concat_flag", "first_attribute", "pattern_name")

final_freq_table=userSubset_fn.union(userSubset_ln).union(userSubset_dob).union(userSubset_pswd)
#getting counts by pattern names
#final_freq_table=patternData
final_freq_table=addnewcolumn(final_freq_table, "patternwisecounts",
["pattern_name"]).repartition(48)

#ranking patterns by their counts: the most frequent pattern gets patternwiserank = 1

w = Window.orderBy(final_freq_table["patternwisecounts"].desc())
final_freq_table = final_freq_table.withColumn("patternwiserank",sqlfun.dense_rank().over(w)).repartition(48)

#reordering columns and writing the result
final_freq_table = final_freq_table.select("id", "first_name", "last_name", "password", "dob", "login", "login_length", "login_concat", "locale", "freq_first_name", "freq_last_name", "freq_password", "freq_dob", "freq_login_length", "freq_login_concat", "fn_flag", "ln_flag", "pswd_flag", "dob_flag", "loglen_flag", "concat_flag", "first_attribute", "pattern_name", "patternwisecounts", "patternwiserank")
#coalesce(1) collapses the output to a single partition, so one parquet file is written at the target S3 prefix
final_freq_table.coalesce(1).write.mode("overwrite").parquet("s3://" + S3_bucket + "/identity/bot_test/pyspark_conversion/freq_rt/")

# ------------------------------ # 2. Spark SQL to drop the table if it exists # ---------------------------- #
spark.sql("drop table if exists " + IDENTITY_DB + ".pyspark_freq_rt" )
#creating a hive table over the result
spark.sql("CREATE TABLE " + IDENTITY_DB + ".pyspark_freq_rt \
(id string, \
first_name string, \
last_name string, \
password string, \
dob string, \
login string, \
login_length string, \
login_concat string, \
locale string, \
freq_first_name bigint, \
freq_last_name bigint, \
freq_password bigint, \
freq_dob bigint, \
freq_login_length bigint, \
freq_login_concat bigint, \
fn_flag int, \
ln_flag int, \
pswd_flag int, \
dob_flag int, \
loglen_flag int, \
concat_flag int, \
first_attribute string, \
pattern_name string, \
patternwisecounts bigint, \
patternwiserank int) \
STORED AS PARQUET \
LOCATION 's3://" + S3_bucket +
"/identity/bot_test/pyspark_conversion/freq_rt/'")

print "checkpoint write data"

####################################################################################
if __name__ == "__main__":
#reading in the arguments to be passed
S3_bucket=sys.argv[1]
IDENTITY_DB = sys.argv[2]
print("The process is now running")

#setting app name and logging level

spark = SparkSession \
.builder \
.appName("frequency_calc") \
.config("spark.serializer","org.apache.spark.serializer.KryoSerializer") \
.config("spark.sql.shuffle.partitions", "12") \
.config("spark.sql.hive.convertMetastoreParquet","false") \
.enableHiveSupport().getOrCreate()

spark.conf.set("spark.sql.parquet.cacheMetadata", "false")
spark.conf.set("fs.s3.enableServerSideEncryption", "true")
spark.conf.set("hive.exec.dynamic.partition.mode","nonstrict")
spark.conf.set("hive.mapred.mode","nonstrict")
spark.conf.set("spark.sql.parquet.compression.codec", "snappy")

# spark = SparkSession.builder.appName("frequency_calc").enableHiveSupport().getOrCreate().config("fs.s3.enableServerSideEncryption","true")
# spark.conf.set("fs.s3.enableServerSideEncryption", "true")
# spark.sparkContext.setLogLevel("WARN")
#noting start time
startTime=datetime.datetime.now()

sc = spark.sparkContext
# sqlContext = SQLContext(sc)
# hiveContext = HiveContext(sc)
# hiveContext.setConf("fs.s3.enableServerSideEncryption", "true")

#calling the main function that calculates frequency


main(S3_bucket,IDENTITY_DB)

####################################################################################
endTime=datetime.datetime.now()
print ("Time taken for the process")
print(endTime-startTime)

####################################################################################

####################################################################################