Sie sind auf Seite 1 von 10

## This script cleans the raw data and gives the distribution of the managed and
## non-managed products within performance percentiles


set.seed(10203)
library(plyr)
library(ggplot2)
## Load the contract data and drop every row whose third column is <= 1.
## NOTE(review): the original comment described this filter as "equal or smaller
## to zero" while the code compares against 1 -- confirm the intended threshold.
original.margin.volume.contracts <- read.csv("R Transfer.csv")
## A logical mask is used instead of x[-which(cond), ]: when no row matches,
## -which() is an empty index and would drop every row. Rows where the
## comparison is NA are kept, exactly as which() kept them.
original.margin.volume.contracts <- original.margin.volume.contracts[
  !((original.margin.volume.contracts[, 3] <= 1) %in% TRUE),
]
## Replace the Excel "#DIV/0!" error text in column 5 with "0" and store the
## numeric result in the column named UM.
margin.volume.clean <- transform(
  original.margin.volume.contracts,
  UM = as.numeric(gsub(
    "#DIV/0!",
    "0",
    as.character(original.margin.volume.contracts[, 5])
  ))
)
## Remove the leading word (everything up to and including the first space)
## from BDM, then upper-case the result. The substitution is applied twice, as
## in the original statements, so up to two leading words are removed; a name
## that contains no space is left unchanged by either pass.
## NOTE(review): confirm the second pass is intended and not a pasted duplicate.
margin.volume.clean <- transform(
  margin.volume.clean,
  BDM = toupper(
    gsub("^.*? ", "", gsub("^.*? ", "", margin.volume.clean$BDM))
  )
)
## Read the win-rate table and derive an upper-case BDM key from its first column.
original.win.rate <- read.csv("Win Rate Transfer.csv", header = TRUE)
## Remove every comma together with the single character that follows it.
## (An earlier assignment that stripped only the commas was dead code: it was
## immediately overwritten by this one, which also starts from original.win.rate.)
## NOTE(review): the stated goal is "remove first name and comma", but ",."
## deletes only one character after the comma -- confirm against the data.
win.rate.clean <- transform(
  original.win.rate,
  BDM = gsub(",.", "", original.win.rate[, 1])
)
## Sort by column 5, largest first.
win.rate.clean <- arrange(win.rate.clean, -win.rate.clean[, 5])
win.rate.clean <- transform(
  win.rate.clean,
  BDM = toupper(as.character(win.rate.clean$BDM))
)
##create name vector
## Assign a numeric ID to every row: the factor level code of column 1.
margin.volume.clean <- transform(
  margin.volume.clean,
  UID = as.numeric(as.factor(margin.volume.clean[, 1]))
)
## Split into the managed (um) and non-managed (num) tables using column 6:
## um keeps every row whose column 6 is not 0, num every row whose column 6 is
## not 1. Logical masks replace x[-which(cond), ], which would return zero rows
## whenever no row satisfies cond. Rows with NA in column 6 stay in both tables,
## as they did with which().
um <- margin.volume.clean[!((margin.volume.clean[, 6] == 0) %in% TRUE), ]
num <- margin.volume.clean[!((margin.volume.clean[, 6] == 1) %in% TRUE), ]
## Order all three tables by TM, largest first.
library(plyr)
margin.volume.clean <- arrange(margin.volume.clean, -TM)
um <- arrange(um, -TM)
num <- arrange(num, -TM)
## Find conditional distributions. du[k] is the number of rows with column 6
## equal to 1 (managed) among the first x[k] share of margin.volume.clean, in
## its current row order, divided by nrow(um). dnu does the same for non-managed.

x <- seq(from = .01, to = 1, by = .01)
## Number of leading rows covered by each percentile. The small offset absorbs
## floating-point error in x * nrow (e.g. 0.29 * 100 evaluates to 28.99...), so
## a cut-off is never one row short. A cut-off below one row now selects no
## rows; the former 1:(y * nrow) index selected row 1 in that case.
percentile.cutoffs <- floor(x * nrow(margin.volume.clean) + 1e-7)
## Running count of managed rows; the leading 0 is the count for a cut-off of
## zero rows. NA comparisons are not counted, matching which().
managed.running.count <- c(
  0,
  cumsum((margin.volume.clean[, 6] == 1) %in% TRUE)
)
du <- managed.running.count[percentile.cutoffs + 1] / nrow(um)
## NOTE(review): this section reads zip-code demographic / sector files rather
## than the contract data used by the surrounding statements, and it assigns
## original.statistics -- confirm it belongs at this point of the script.
library(Hmisc)
library(reshape2)
## NOTE(review): machine-specific absolute path; the two reads below depend on it.
setwd("C:\\Users\\ma\\Documents\\Columbia Data")
original.statistics <- spss.get("zip-bp01.sav")
demographics <- read.csv("Demographic_Statistics_By_Zip_Code.csv", header = TRUE)
merged.stats <- merge(
  demographics,
  original.statistics,
  by.x = "JURISDICTION.NAME",
  by.y = "ZIP"
)
## Long format: one row per (SECTOR, JURISDICTION.NAME) with PERCENT.CONTRACTS
## as the measured variable.
melted.merge <- melt(
  merged.stats,
  id.vars = c("SECTOR", "JURISDICTION.NAME"),
  measure.vars = c("PERCENT.CONTRACTS")
)
melted.merge <- transform(melted.merge, Sector.ID = as.numeric(melted.merge$SECTOR))
## Wide format: one row per JURISDICTION.NAME, one column per SECTOR.
casted.merge <- dcast(melted.merge, JURISDICTION.NAME ~ SECTOR)
## Row totals over columns 2 to 22 of the wide table.
sum.industries <- unname(rowSums(casted.merge[, 2:22]))
casted.merge <- transform(casted.merge, Total.Industry = sum.industries)
## The key must be a quoted column name; the bare symbol JURISDICTION.NAME is
## not an object and made merge() fail.
casted.merge <- merge(demographics, casted.merge, by = "JURISDICTION.NAME")
casted.merge.mod <- casted.merge
## After the merge the result holds the key, then the remaining demographics
## columns, then the columns of the wide table, so the 21 summed columns no
## longer sit at positions 2:22. Locate them behind the demographics columns
## and divide by the row's own Total.Industry, which travelled through the
## merge with its row.
## NOTE(review): assumes the 21 summed columns are the ones to normalise and
## that demographics has no column named Total.Industry -- confirm.
sector.positions <- ncol(demographics) + seq_len(21)
for (n in sector.positions) {
  casted.merge.mod[, n] <- casted.merge.mod[, n] / casted.merge.mod$Total.Industry
}
## NOTE(review): the source line began with a dangling `casted.merge.mod <`
## fragment whose right-hand side is not visible; it is preserved only as this
## note so the statement that follows parses.
## dnu[k]: number of rows with column 6 equal to 0 (non-managed) among the first
## x[k] share of margin.volume.clean, in its current row order, divided by
## nrow(num). x is the percentile vector built earlier (0.01 to 1 by 0.01).
## The 1e-7 offset absorbs floating-point error in x * nrow.
total.margin.cutoffs <- floor(x * nrow(margin.volume.clean) + 1e-7)
dnu <- c(
  0,
  cumsum((margin.volume.clean[, 6] == 0) %in% TRUE)
)[total.margin.cutoffs + 1] / nrow(num)
## Columns are named with `=`. The former `<-` inside data.frame() assigned
## three stray top-level variables and left the columns with deparsed
## expression text as names.
distribution.percentile.total.margin <- data.frame(
  Cumulative.Performance.Percentile = seq(.01, to = 1, by = .01),
  Percentage.Managed.Products = du,
  Percentage.Non.Managed.Products = dnu
)
## Accumulators for the unit-margin distributions filled further down.
duum <- c()
dnuum <- c()
## Re-order all three tables by Vo, largest first.
um <- arrange(um, -Vo)
num <- arrange(num, -Vo)
margin.volume.clean <- arrange(margin.volume.clean, -Vo)
## duv[k] / dnuv[k]: number of rows with column 6 equal to 1 / 0 among the
## first x[k] share of margin.volume.clean, divided by nrow(um) / nrow(num).
## The 1e-7 offset absorbs floating-point error in x * nrow; the leading 0 is
## the count for a cut-off of zero rows.
volume.cutoffs <- floor(x * nrow(margin.volume.clean) + 1e-7)
duv <- c(
  0,
  cumsum((margin.volume.clean[, 6] == 1) %in% TRUE)
)[volume.cutoffs + 1] / nrow(um)
dnuv <- c(
  0,
  cumsum((margin.volume.clean[, 6] == 0) %in% TRUE)
)[volume.cutoffs + 1] / nrow(num)
## Columns are named with `=`. The former `<-` inside data.frame() assigned
## stray top-level variables and produced unusable column names.
distribution.percentile.Volume <- data.frame(
  Cumulative.Performance.Percentile = seq(.01, to = 1, by = .01),
  Percentage.Managed.Products = duv,
  Percentage.Non.Managed.Products = dnuv
)
## Re-order all three tables by UM, largest first.
um <- arrange(um, -UM)
num <- arrange(num, -UM)
margin.volume.clean <- arrange(margin.volume.clean, -UM)
## duum[k] / dnuum[k]: number of rows with column 6 equal to 1 / 0 among the
## first x[k] share of margin.volume.clean, divided by nrow(um) / nrow(num).
## The 1e-7 offset absorbs floating-point error in x * nrow; the leading 0 is
## the count for a cut-off of zero rows.
unit.margin.cutoffs <- floor(x * nrow(margin.volume.clean) + 1e-7)
duum <- c(
  0,
  cumsum((margin.volume.clean[, 6] == 1) %in% TRUE)
)[unit.margin.cutoffs + 1] / nrow(um)
dnuum <- c(
  0,
  cumsum((margin.volume.clean[, 6] == 0) %in% TRUE)
)[unit.margin.cutoffs + 1] / nrow(num)
## Columns are named with `=`. The former `<-` inside data.frame() assigned
## stray top-level variables and produced unusable column names.
distribution.percentile.unit.margin <- data.frame(
  Cumulative.Performance.Percentile = seq(.01, to = 1, by = .01),
  Percentage.Managed.Products = duum,
  Percentage.Non.Managed.Products = dnuum
)
##rank percentile for unit margin
##run regression statistics for the percentage of managed/nonmanaged products in each percentile category
## Nonmanaged distribution vs total distribution looks perfectly straight so
##Control R to find out how many are in a range
##a <- ##lower bound percentile
##b <- ##upper bound percentile
##for managed product
##distribution.percentile.product.contracts[a, 2] - distribution.percentile.product.contracts[a, 2]
## for nonmanaged
##distribution.percentile.product.contracts[a, 3] - distribution.percentile.product.contracts[a, 3]
## Convert the BDM column of the managed and non-managed tables to character.
um <- transform(um, BDM = as.character(um$BDM))
num <- transform(num, BDM = as.character(num$BDM))
## Count how many rows of um carry each distinct value of column 7 (presumably
## the UID column -- confirm), keeping the column-1 name found on the first row
## of each value. The row count is taken from um itself instead of the
## hard-coded 3132, and the table is built directly instead of filling 3132
## placeholder rows and deleting the empty ones afterwards (a deletion that
## removed every row whenever no placeholder was left empty).
managed.uid <- um[, 7]
## Rows with a missing ID are ignored, as the former `==` comparison ignored them.
first.row.per.uid <- which(!is.na(managed.uid) & !duplicated(managed.uid))
managed.product.count.per.user <- data.frame(
  User = as.character(um[first.row.per.uid, 1]),
  Count = as.numeric(tabulate(
    match(managed.uid, managed.uid[first.row.per.uid]),
    nbins = length(first.row.per.uid)
  )),
  stringsAsFactors = FALSE
)
## ID = as.numeric(managed.product.count.per.user$ID) was already commented out.
## Most frequent users first; ties keep their order of first appearance in um.
managed.product.count.per.user <- arrange(
  managed.product.count.per.user,
  -managed.product.count.per.user$Count
)
## Create the user-ID vector for managed.product.count.per.user: for each user
## name (column 1) take column 7 of the first um row whose column 1 equals it.
## match() compares whole names. The former scan used each um name as a regular
## expression, so a name could match inside a longer one; it could also index
## past the last row of um or never terminate when a user had no match, and it
## overwrote the percentile vector x with a counter. A user with no matching
## row now gets NA.
UID <- um[match(managed.product.count.per.user[, 1], um[, 1]), 7]
managed.product.count.per.user <- transform(managed.product.count.per.user, UID = UID)
## Add, for every user, the managed-contract count (column 2) divided by the
## number of rows of margin.volume.clean whose column 1 equals the user's name.
## Names are compared for equality. The former grepl() call treated the name as
## a regular expression, so rows of any longer name containing it were counted
## in the denominator as well.
all.contract.names <- as.character(margin.volume.clean[, 1])
contracts.per.user <- vapply(
  as.character(managed.product.count.per.user[, 1]),
  function(user.name) sum(all.contract.names == user.name, na.rm = TRUE),
  integer(1),
  USE.NAMES = FALSE
)
managed.product.count.per.user.proportion <-
  managed.product.count.per.user[, 2] / contracts.per.user
managed.product.count.per.user <- transform(
  managed.product.count.per.user,
  proportion.of.total.contracts = managed.product.count.per.user.proportion
)
##Remove Duplicate
##modified version
## Create the naming table: one row per distinct (BDM, UID) pair, in order of
## first appearance. Taking unique pairs keeps every name on the same row as
## its ID; previously the two columns were de-duplicated separately (and a
## misplaced parenthesis passed the UID values into unique() as an extra,
## ignored argument).
uid.table <- unique(margin.volume.clean[, c("BDM", "UID")])
row.names(uid.table) <- NULL
## Attach the UID to every win-rate row with a matching BDM.
win.rate.clean.uid <- merge(win.rate.clean, uid.table, by = "BDM")
##
```
```{r, echo = FALSE}
## Basic statistics of column 4 (total margin), column 5 (unit margin) and
## column 3 (volume) for the non-managed (num) and managed (um) tables.
## Each vector holds the non-managed value first, then the managed value.
original.statistics <- data.frame(
  Type = c("Non.Managed", "Managed"),
  Mean.Total.Margin = c(mean(num[, 4]), mean(um[, 4])),
  SD.Total.Margin = c(sd(num[, 4]), sd(um[, 4])),
  Median.Total.Margin = c(median(num[, 4]), median(um[, 4])),
  Mean.Unit.Margin = c(mean(num[, 5]), mean(um[, 5])),
  SD.Unit.Margin = c(sd(num[, 5]), sd(um[, 5])),
  ## Each median is its own call; a misplaced parenthesis used to pass the
  ## managed median to median() as its second argument, so the non-managed
  ## value was reported for both rows.
  Median.Unit.Margin = c(median(num[, 5]), median(um[, 5])),
  Mean.Volume = c(mean(num[, 3]), mean(um[, 3])),
  ## The managed entry is a standard deviation (it previously repeated the mean).
  SD.Volume = c(sd(num[, 3]), sd(um[, 3])),
  Median.Volume = c(median(num[, 3]), median(um[, 3]))
)
## Unit margin vectors (column 5) with the values over 20 taken out.
## -20 data points for managed product contracts.
## Logical masks replace x[-which(cond), 5], which would return an empty
## vector whenever no value exceeds 20. NA values are kept, as before.
unit.margin.managed.products.clean <- um[!((um[, 5] > 20) %in% TRUE), 5]
unit.margin.non.managed.products.clean <- num[!((num[, 5] > 20) %in% TRUE), 5]
##To deal with the high standard deviations, I take the means of the samples.
## 10000 times: draw 300 rows of um without replacement and store the mean of
## column 4; draw again and store the mean of column 3; draw again and store
## the mean of column 5. The three draws per iteration are made in this order
## so the random-number stream is consumed exactly as before. Result vectors
## are preallocated instead of being grown by append() (whose `after` argument
## had been passed to mean() by a misplaced parenthesis).
margin.mean.of.managed.samples <- numeric(10000)
volume.mean.of.managed.samples <- numeric(10000)
unit.margin.mean.of.managed.samples <- numeric(10000)
for (draw in seq_len(10000)) {
  margin.mean.of.managed.samples[draw] <-
    mean(um[sample(1:nrow(um), 300, replace = FALSE), 4])
  volume.mean.of.managed.samples[draw] <-
    mean(um[sample(1:nrow(um), 300, replace = FALSE), 3])
  unit.margin.mean.of.managed.samples[draw] <-
    mean(um[sample(1:nrow(um), 300, replace = FALSE), 5])
}
## Same resampling for the non-managed table. 10000 times: draw 300 rows of num
## without replacement and store the mean of column 4; draw again for column 3;
## draw again for column 5. The draw order is kept so the random-number stream
## is consumed exactly as before. Result vectors are preallocated instead of
## being grown by append() (whose `after` argument had been passed to mean()).
margin.mean.of.non.managed.samples <- numeric(10000)
volume.mean.of.non.managed.samples <- numeric(10000)
unit.margin.mean.of.non.managed.samples <- numeric(10000)
for (draw in seq_len(10000)) {
  margin.mean.of.non.managed.samples[draw] <-
    mean(num[sample(1:nrow(num), 300, replace = FALSE), 4])
  volume.mean.of.non.managed.samples[draw] <-
    mean(num[sample(1:nrow(num), 300, replace = FALSE), 3])
  unit.margin.mean.of.non.managed.samples[draw] <-
    mean(num[sample(1:nrow(num), 300, replace = FALSE), 5])
}
## Mean, median and standard deviation of the resampled means, for total
## margin, volume and unit margin. Each vector holds the non-managed value
## first, then the managed value, matching Type.
sample.statistics <- data.frame(
  Type = c("Non.Managed", "Managed"),
  Total.Margin.Mean = c(
    mean(margin.mean.of.non.managed.samples),
    mean(margin.mean.of.managed.samples)
  ),
  Total.Margin.Median = c(
    median(margin.mean.of.non.managed.samples),
    median(margin.mean.of.managed.samples)
  ),
  SD.Total.Margin.Mean = c(
    sd(margin.mean.of.non.managed.samples),
    sd(margin.mean.of.managed.samples)
  ),
  Volume.Mean = c(
    mean(volume.mean.of.non.managed.samples),
    mean(volume.mean.of.managed.samples)
  ),
  Volume.Median = c(
    median(volume.mean.of.non.managed.samples),
    median(volume.mean.of.managed.samples)
  ),
  SD.Volume.Mean = c(
    sd(volume.mean.of.non.managed.samples),
    sd(volume.mean.of.managed.samples)
  ),
  Unit.Margin.Mean = c(
    mean(unit.margin.mean.of.non.managed.samples),
    mean(unit.margin.mean.of.managed.samples)
  ),
  Unit.Margin.Median = c(
    median(unit.margin.mean.of.non.managed.samples),
    median(unit.margin.mean.of.managed.samples)
  ),
  SD.Unit.Margin.Mean = c(
    sd(unit.margin.mean.of.non.managed.samples),
    sd(unit.margin.mean.of.managed.samples)
  )
)
##Find control for users
## Accumulators declared for the per-BDM statistics; only the two total-margin
## means are filled in by the statements below.
sd.total.margin.BDM.managed <- c()
sd.total.margin.BDM.non.managed <- c()
mean.unit.margin.BDM.managed <- c()
mean.unit.margin.BDM.non.managed <- c()
sd.unit.margin.BDM.managed <- c()
sd.unit.margin.BDM.non.managed <- c()
mean.volume.BDM.managed <- c()
mean.volume.BDM.non.managed <- c()
sd.volume.BDM.managed <- c()
sd.volume.BDM.non.managed <- c()
## Mean of column 4 of margin.volume.clean over the rows whose column 1 equals
## bdm.name and whose i2i flag equals i2i.flag. NaN when no row qualifies.
## Names are compared for equality; the former grepl() call treated the name as
## a regular expression and also picked up longer names containing it.
bdm.total.margin.mean <- function(bdm.name, i2i.flag) {
  selected.rows <- which(
    margin.volume.clean[, 1] == bdm.name & margin.volume.clean$i2i == i2i.flag
  )
  mean(as.numeric(margin.volume.clean[selected.rows, 4]), na.rm = FALSE)
}
##managed product list
listed.bdm.names <- as.character(managed.product.count.per.user[, 1])
mean.total.margin.BDM.managed <- vapply(
  listed.bdm.names, bdm.total.margin.mean, numeric(1),
  i2i.flag = 1, USE.NAMES = FALSE
)
mean.total.margin.BDM.non.managed <- vapply(
  listed.bdm.names, bdm.total.margin.mean, numeric(1),
  i2i.flag = 0, USE.NAMES = FALSE
)
## Per-BDM comparison of the mean total margin of managed and non-managed
## contracts.
BDM.managed.non.managed.mean.comparison <- data.frame(
  BDM = managed.product.count.per.user$User,
  total.margin.mean.managed = as.numeric(mean.total.margin.BDM.managed),
  total.margin.mean.non.managed = as.numeric(mean.total.margin.BDM.non.managed)
)
## Managed minus non-managed mean, one value per BDM.
managed.minus.non.managed <-
  BDM.managed.non.managed.mean.comparison$total.margin.mean.managed -
  BDM.managed.non.managed.mean.comparison$total.margin.mean.non.managed
## Average difference over all BDMs, ignoring BDMs with a missing mean.
qw <- mean(managed.minus.non.managed, na.rm = TRUE)
## Number of BDMs whose managed mean is the higher one.
er <- length(which(managed.minus.non.managed > 0))
##on average how much higher is the total margin for managed products than for
##nonmanaged products (over the BDMs where managed is higher)
zw <- mean(
  managed.minus.non.managed[which(managed.minus.non.managed > 0)],
  na.rm = TRUE
)
##on average when the nonmanaged products win for a BDM, how much do the
##nonmanaged products beat the managed products by
non.managed.bdm.mean <- mean(
  -managed.minus.non.managed[which(managed.minus.non.managed < 0)],
  na.rm = TRUE
)

```
```{r, echo = FALSE}
## Attach the UID to every win-rate row with a matching BDM.
win.rate.clean.uid <- merge(win.rate.clean, uid.table, by = "BDM")
## 1 when the row's column 10 (the UID column appended by the merge) occurs
## among the UIDs of managed.product.count.per.user, otherwise 0. %in% performs
## the same test as the former row-by-row is.element() loop and also handles a
## table with zero rows.
is.in.i2i <- as.numeric(
  win.rate.clean.uid[, 10] %in% managed.product.count.per.user$UID
)
win.rate.clean.uid.i2i <- transform(win.rate.clean.uid, in.i2i = is.in.i2i)
## Mean and standard deviation of column 4 (win rate) and column 5 (renewal
## rate) for the rows whose in.i2i flag (column 11) is 1 and for those where it
## is 0. All four statistics select on column 11 with `==`: the SD entries had
## lost one `=` of the comparison and one digit of the column index (the
## displaced characters trailed the statement).
i2i.member.rows <- which(win.rate.clean.uid.i2i[, 11] == 1)
i2i.other.rows <- which(win.rate.clean.uid.i2i[, 11] == 0)
win.rate.stats <- data.frame(
  Type = c("Managed", "Non Managed"),
  Mean.Win.Rate = c(
    mean(win.rate.clean.uid.i2i[i2i.member.rows, 4]),
    mean(win.rate.clean.uid.i2i[i2i.other.rows, 4])
  ),
  SD.Win.Rate = c(
    sd(win.rate.clean.uid.i2i[i2i.member.rows, 4]),
    sd(win.rate.clean.uid.i2i[i2i.other.rows, 4])
  ),
  Mean.Renewal.Rate = c(
    mean(win.rate.clean.uid.i2i[i2i.member.rows, 5]),
    mean(win.rate.clean.uid.i2i[i2i.other.rows, 5])
  ),
  SD.Renewal.Rate = c(
    sd(win.rate.clean.uid.i2i[i2i.member.rows, 5]),
    sd(win.rate.clean.uid.i2i[i2i.other.rows, 5])
  )
)

##Density plot of new win rates


##plot(
## density(win.rate.clean.uid.i2i[which(win.rate.clean.uid.i2i[, 11] == 1), 4]), col= "green")
##
##
##lines(density(win.rate.clean.uid.i2i[which(win.rate.clean.uid.i2i[, 11] == 0), 4]), col="purple")
##
##Density plots of renewal rates
##plot(
## density(win.rate.clean.uid.i2i[which(win.rate.clean.uid.i2i[, 11] == 1), 5]), col= "green")
##lines(density(win.rate.clean.uid.i2i[which(win.rate.clean.uid.i2i[, 11] == 0), 5]), col="purple")
##scatterplot of number of contracts vs Renewal rates
## Join the win-rate rows to the per-user managed-product counts on UID. The
## join is done once; an identical second merge further down added nothing.
rates.of.users <- merge(
  win.rate.clean.uid.i2i,
  managed.product.count.per.user,
  by = "UID"
)
##plot(rates.of.users$Count, rates.of.users$RENEWAL.RATE)
##scatterplot of proportion of contracts managed vs Renewal Rates
##plot(rates.of.users$proportion.of.total.contracts, rates.of.users$RENEWAL.RATE)
##Regression statistics of renewal rates versus the proportion of the contracts
##renewed

Das könnte Ihnen auch gefallen