AnalysisQSlinkssimple.Rmd

---
title: "Results analysis"
---
Load packages, read files, combine files.
[1] "pq9TargRank"    "gainin30"       "spearman2"      "pearson2"      
  [5] "pMAE"           "pRMSE"          "ocvRMSE"        "RMSEutrans"    
  [9] "MAEutrans"      "date"           "algomodel"      "trgCol"        
 [13] "transTarg"      "task"           "missing"        "append"        
 [17] "transform"      "pc"             "expirament"     "fold"          
 [21] "maxfold"        "seed"           "seedit"         "foldseed"      
 [25] "RMSEmean"       "RMSEmeantrain"  "hpGen"          "time"          
 [29] "validmethod"    "tuneLength"     "cvcount"        "ignrepeats"    
 [33] "adaptivemin"    "bestTuneparams" "btp1"           "btp2"          
 [37] "btp3"           "btp4"           "btp5"           "btp6"          
 [41] "btp7"           "btp8"           "btp9"           "btp10"         
 [45] "btp11"
```{r}
col_typed <- "dddddddddccddccccccdddddddcdcddc??????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????"
#71 per row so 142+ more than 20?
#167
# On a re-run, go back to the project root recorded by a previous execution.
# In a fresh session `mainDir` does not exist yet (it is assigned on the next
# line), so the error raised here is reported by try() but is not fatal and
# the current working directory is kept.
try({setwd(mainDir)})
# Remember the project root; the per-machine result folders are resolved
# against it below.
mainDir<-getwd()
# Only referenced by the commented-out dir.create() call in the visible part
# of this file.
subDir<-"QSlinks_simple"
#dir.create(file.path(mainDir, subDir))
# Packages used by the chunks of this document. require() only warns and
# returns FALSE when a package is missing, so a missing package shows up
# later, at the first call into it.
  require(readr)
require(netCoin)
require(data.table)
require(naniar) 
require(VIM)
require(ggplot2)

# Results folder "ACEREBOUT": switch into it, read the main results file with
# the shared column specification `col_typed`, and read the
# "test after which reset.csv" file that sits next to it.
resultDir<-"ACEREBOUT"
#setwd(resultDir)
setwd(file.path(mainDir, resultDir))
QSlink3ACE <- read_csv("outQSlink3ACEREBOUTwindowsx64.csv",col_types = col_typed)
resetACE <- read_csv("test after which reset.csv")
# Replace whatever header was read with explicit column names.
names(resetACE)<-c("date","model","task","fold","cv","arch","arch2","pc")
#QSlinkImp3ACE <- read_csv("importanceQSlink3ACEREBOUTwindowsx64.csv",col_types = col_typed)
#summary(QSlink3ACE[,1:30])
# Show the column specification readr ended up with and any parsing problems.
 spec(QSlink3ACE)
 problems(QSlink3ACE)
# Return to the project root so the next folder is resolved correctly.
 setwd(file.path(mainDir))


 
 # Results folder "HOPPER": switch into it, read the main results file with
 # the shared column specification `col_typed`, and read the
 # "test after which reset.csv" file that sits next to it.
 resultDir<-"HOPPER"
#setwd(resultDir)
setwd(file.path(mainDir, resultDir))
QSlink3HOP <- read_csv("outQSlink3HOPPERwindowsx64.csv",col_types = col_typed)
resetHOP <- read_csv("test after which reset.csv")
# Replace whatever header was read with explicit column names.
names(resetHOP)<-c("date","model","task","fold","cv","arch","arch2","pc")
#summary(QSlink3ACE[,1:30])
# Show the column specification readr ended up with and any parsing problems.
 spec(QSlink3HOP)
 problems(QSlink3HOP)
# Return to the project root so the next folder is resolved correctly.
 setwd(file.path(mainDir))
 
# Print the column names of the HOPPER results table.
 names(QSlink3HOP)
 
  # Results folder "LAPTOPBBQ": switch into it, read the main results file
  # with the shared column specification `col_typed`, and read the
  # "test after which reset.csv" file that sits next to it.
  resultDir<-"LAPTOPBBQ"
#setwd(resultDir)
setwd(file.path(mainDir, resultDir))
QSlink3BBQ <- read_csv("outQSlink3LAPTOPBBQwindowsx64.csv",col_types = col_typed)
resetBBQ <- read_csv("test after which reset.csv")
# Replace whatever header was read with explicit column names.
names(resetBBQ)<-c("date","model","task","fold","cv","arch","arch2","pc")
#summary(QSlink3ACE[,1:30])
# Show the column specification readr ended up with and any parsing problems.
 spec(QSlink3BBQ)
 problems(QSlink3BBQ)
# Return to the project root once the last folder has been read.
 setwd(file.path(mainDir))

```
```{r}

require(data.table)

# Stack the three per-folder result tables into one data.table.
# fill = TRUE pads any column that is absent from one of the inputs with NA.
# (TRUE is spelled out: `T` is an ordinary variable that can be reassigned.)
DF <- rbindlist(
  list(
    as.data.table(QSlink3ACE),
    as.data.table(QSlink3HOP),
    as.data.table(QSlink3BBQ)
  ),
  fill = TRUE
)
# Drop the HOPPER and LAPTOPBBQ source tables; QSlink3ACE is kept, as before.
rm(QSlink3BBQ, QSlink3HOP)

# Stack the three "test after which reset" tables the same way.
# idcol = TRUE adds a `.id` column holding the position (1, 2, 3) of the
# source table in the list.
reset <- rbindlist(
  list(
    as.data.table(resetACE),
    as.data.table(resetHOP),
    as.data.table(resetBBQ)
  ),
  fill = TRUE,
  idcol = TRUE
)
rm(resetACE, resetHOP, resetBBQ)
```

remove useless columns, convert time, check for unused columns, seed changes?
```{r}

# Convert the textual time stamps (weekday, month name, day, H:M:S, year) to
# seconds since the epoch. The strings carry no time zone, and the plotting
# chunk turns these numbers back into POSIXct with tz = "UTC", so they are
# parsed as UTC here as well. Parsing in the session's local zone instead
# would shift every displayed time by the local UTC offset and make the result
# depend on the machine the analysis runs on.
# NOTE(review): %a and %b are locale dependent; this assumes an English locale.
stampFormat <- "%a %b %d %H:%M:%S %Y"
DF$date <- as.numeric(as.POSIXct(DF$date, format = stampFormat, tz = "UTC"))
reset$date <- as.numeric(as.POSIXct(reset$date, format = stampFormat, tz = "UTC"))

#str(DF)
summary(DF$pq9TargRank)

#what is seedit? and seed bcause foldseed is always the same
summary(DF[, .(seed, seedit, foldseed)])#need foldseed now
# Distinct values of the bookkeeping columns, printed one after the other.
unique(DF$seedit)
unique(DF$foldseed)
#unique(DF$seed)
unique(DF$trgCol)
unique(DF$missing)
unique(DF$append)
unique(DF$hpGen)
unique(DF$tuneLength)
unique(DF$ignrepeats)

# Drop the columns listed in `usless`, then print the remaining column names.
usless <- c("trgCol", "missing", "append")
DF <- DF[, -(usless), with = FALSE]
names(DF)
# Number of columns before any derived feature column is added; used when the
# final table is sliced.
maxcol_bef_fetcre <- ncol(DF)
```

which computers ran when?
```{r}
require(scales)
#require(ggraptR)
#ggraptR()

# Machines to show, and the text theme shared by both plots.
machines <- c("ACEREBOUT", "LAPTOPBBQ", "HOPPER")
dateTheme <- theme(text = element_text(family = "sans", face = "plain",
                                       color = "#000000", size = 15,
                                       hjust = 0.5, vjust = 0.5))

# When did each machine produce results? `date` holds seconds since the epoch;
# `datePOSIX` is the same instant as a UTC date-time for a readable axis.
unique(DF$pc)
DF[, datePOSIX := as.POSIXct(date, origin = "1970-01-01", tz = "UTC")]
ggplot(DF[pc %in% machines][order(date)], aes(x = date, y = datePOSIX)) +
  geom_point() +
  facet_wrap(~ pc) +
  theme_grey() +
  dateTheme +
  xlab("date") +
  ylab("date")# + scale_y_date(labels = date_format("%m/%d"))# +theme(axis.text.x = element_text(angle=45))
# Format : month/day

# Same view for the reset table. The earlier version added facet_wrap(~ .id)
# and then facet_wrap(~ pc); a later facet specification replaces an earlier
# one, so only the `pc` facet ever took effect and only that one is kept.
unique(reset$pc)
reset[, datePOSIX := as.POSIXct(date, origin = "1970-01-01", tz = "UTC")]
ggplot(reset[pc %in% machines][order(date)], aes(x = date, y = datePOSIX)) +
  geom_point() +
  theme_grey() +
  facet_wrap(~ pc) +
  dateTheme +
  xlab("date") +
  ylab("date")

```

The fold assignment now depends on the seed, so `fold` becomes the fold number pasted together with `foldseed`.
```{r}
# Keep the raw fold value in `foldOrg` and replace `fold` by
# "<fold>.<foldseed>". Both right-hand sides are evaluated on the original
# `fold` before either column is written.
DF[, c("foldOrg", "fold") := list(fold, paste0(fold, ".", foldseed))]
```



Which models were somehow missed? This is just a check in case of errors or
machines that did not run fast enough,
and also in case some hyperparameters were way overblown.
If the console output is negative it is mostly OK; it still means that some combinations that exist in the system were never run by any model.
```{r}
# One key per task / transform / transTarg / expirament combination.
DF[, keyMiss := paste0(task, transform, transTarg, expirament)]
# Number of distinct combinations present anywhere in the data ...
luMiss <- uniqueN(DF$keyMiss)
# ... and the number of distinct combinations each algomodel has rows for.
DF[, unmissed := uniqueN(keyMiss), by = algomodel]
# Rows of models that cover fewer combinations than exist, fewest first.
DF[unmissed < luMiss][order(unmissed)]
# Coverage of the best-covered model relative to all combinations.
max(DF$unmissed)-luMiss
plot(DF$unmissed[order(DF$unmissed)])
```



For the netCoin network graph, write out features.
These will be used later on to separate the data and for more detailed views.
Outliers too.
```{r}
# Print the current working directory.
getwd()
# TRUE for rows whose algomodel is "perfect". Note that the column name reads
# as the opposite of what it flags; later chunks rely on this name.
DF$imperfect <- DF$algomodel=="perfect"
# A row is a total failure when hpGen, RMSEutrans, spearman2 and pq9TargRank
# are all missing ...
DF$totFail <- is.na(DF$hpGen) & is.na(DF$RMSEutrans) & is.na(DF$spearman2) & is.na(DF$pq9TargRank)
# ... or when its algomodel is "ignore".
DF$totFail <- DF$totFail |  DF$algomodel=="ignore"
# Partial failures: one metric missing although the row is not a total failure.
# NOTE(review): `noPear` is computed from spearman2, not pearson2 - confirm
# which column was intended.
DF[,noPear:=is.na(spearman2) & !totFail]
DF[,noUtrans:=is.na(RMSEutrans) & !totFail]


# Disabled block: the condition is the constant F, so none of this runs.
if(F){
DF$btp4[is.na(DF$btp4)]<-"NA"
DF$btp4[DF$btp4=="NA"]<-"4.4"
DF$btp5[is.na(DF$btp5)]<-"NA"
DF$btp3[is.na(DF$btp3)]<-"NA"
}
# Replace btp3 by a logical: TRUE where the value equals "3", NA where the
# value was missing.
DF$btp3<-(DF$btp3=="3")
summary(DF$btp3)

```


Plot the netCoin graphs,
first focusing on perfection, then dropping it.
"seed", by the way, is just a function of the R version change.
```{r}
require(netCoin)
#,"seedit"
# Variables for the first coincidence network (includes the `imperfect` flag).
# The surCoin() call that would consume `set` is commented out below.
set <- c("transTarg","task","transform","fold","imperfect","seed","btp3")

#essCoin <- surCoin(data = DF, variables = set, lwidth ="Haberman", lweight="Haberman", color="variable", lcolor="Haberman", ltext="Haberman", linkFilter="Haberman", repulsion=90)
#essCoin
#plot(essCoin)

# Second variable list (failure flags instead of `imperfect`); this assignment
# overwrites the one above and is not used by any active code in this chunk.
set <- c("algomodel", "transTarg", "task", "transform", "fold", "pc", "totFail", "noPear", "noUtrans")


```
perfect/ideal results at section start
check for oddness, remove
```{r}

names(DF)
# Key identifying one transTarg / task / transform / pc / expirament / fold
# combination; used below to count repeated "perfect" rows.
DF[,keyRedun:=paste0(transTarg,task,transform,pc,expirament,fold)]
# Move the rows flagged `imperfect` (algomodel == "perfect") into `Perf` and
# keep only the remaining rows in DF.
Perf<-DF[(imperfect)]
DF<-DF[!(imperfect)]
summary(Perf[,1:20])

# Disabled block (condition is the constant F): would split Perf at the median
# of the distinct RMSEutrans values and summarise both halves.
if(F){ #transtarg1 causes utrans to get numbers other than 0
halfver<-median(unique(Perf$RMSEutrans))
P1<-Perf[RMSEutrans>=halfver,]
P2<-Perf[RMSEutrans<=halfver,]
summary(P1[,1:30])
summary(P2[,1:30])
}

# Number of "perfect" rows per fold.
Perf[,.N,by=fold]
# Disabled block: would print a summary of the first 30 columns per fold.
if(F){ #fold changes max possible but rarely!?
for(i in unique(Perf$fold)){
print(summary(Perf[Perf$fold==i,c(1:30),with=FALSE]))
} }

#check for redundancy against possible combinatorial explosion
# Running product of the number of distinct values of each key column, taken
# in the order the columns appear in Perf; the last printed value is the
# largest number of combinations those columns could form.
Explosion<-1
for(i in which(names(Perf) %in% c("transTarg","task","transform","pc","expirament","fold"))){
  Explosion<-Explosion * dim(unique(Perf[,(i),with=F]))[1]
  print(Explosion)
}
# Combinations actually present, average number of rows per combination, and
# the row count of every combination.
length(unique(Perf$keyRedun))
length(Perf$keyRedun)/length(unique(Perf$keyRedun))
Perf[,.N,by=keyRedun]
# Written relative to the current working directory.
save(Perf,file="routPerfects.RData")
```
Infinite values checked and converted to NA
```{r}
# Count, for every row, how many of the first 45 columns hold an infinite
# value, and store the count in `infin`.
infCount <- rep(0, nrow(DF))
for (colPos in seq_len(45)) {
  infCount <- is.infinite(DF[[colPos]]) + infCount
}
DF[, infin := infCount]

# Keep (and print) a copy of the affected rows before they are changed.
(infinDF <- DF[infin > 0])

# Overwrite every infinite entry in those columns with NA.
for (colName in names(DF)[1:45]) {
  hasInf <- is.infinite(DF[[colName]])
  DF[(hasInf), (colName) := NA]

  #DF[(nafound)]
}

# Same rows after the replacement, followed by the snapshot taken before it.
DF[infin > 0]#not getting everything?
infinDF
```

Percent success
```{r}
#install.packages("VIM")
require(VIM)

# Missing-value overview (proportion per variable and combination pattern),
# drawn in two panels of twenty columns each.
firstCols <- DF[, 1:20]
mice_plot <- aggr(
  firstCols,
  col = c("navyblue", "yellow"),
  numbers = TRUE,
  sortVars = TRUE,
  labels = names(firstCols),
  cex.axis = .7,
  gap = 3,
  ylab = c("Missing data", "Pattern")
)

nextCols <- DF[, 21:40]
mice_plot <- aggr(
  nextCols,
  col = c("navyblue", "yellow"),
  numbers = TRUE,
  sortVars = TRUE,
  labels = names(nextCols),
  cex.axis = .7,
  gap = 3,
  ylab = c("Missing data", "Pattern")
)
```
```{r}
require(naniar)
# Upset plot of co-occurring missing values over the first 25 columns,
# limited to 15 variable sets and 20 intersections.
gg_miss_upset(DF[,1:25],nsets = 15,nintersects=20)
```


Full failures checked and removed.
The mlr fail state is not updated; the algo "ignore" also counts as a failure.
```{r}
# How many rows are total failures, out of how many rows overall.
sum(DF$totFail)
nrow(DF)
# Show at most the first 30 failed rows. head() is used instead of [1:30, ]
# because indexing past the end pads the output with all-NA rows whenever
# fewer than 30 failures exist.
head(DF[(totFail)], 30)

# Keep the failures aside in TF and drop them from DF.
TF <- DF[(totFail)]
DF <- DF[!(totFail)]
```
```{r}
# Missingness of the first 35 columns after the total failures are gone:
# first as an upset plot, then as a VIM aggregation plot.
remaining35 <- DF[, 1:35]

require(naniar)
gg_miss_upset(remaining35, nsets = 15, nintersects = 15)

require(VIM)
mice_plot <- aggr(
  remaining35,
  col = c("navyblue", "yellow"),
  numbers = TRUE,
  sortVars = TRUE,
  labels = names(remaining35),
  cex.axis = .7,
  gap = 3,
  ylab = c("Missing data", "Pattern")
)
```


Which models often failed or were missed, checked again.
If the console output is negative it is mostly OK; it still means that some combinations that exist in the system were never run by any model.
```{r}
# Rebuild the combination key on the rows that are left and recount how many
# distinct combinations exist overall and per algomodel.
DF[, keyMiss := paste0(task, transform, transTarg, expirament)]
luMiss <- uniqueN(DF$keyMiss)
DF[, unmissed := uniqueN(keyMiss), by = algomodel]
# Rows of models that cover fewer combinations than exist, fewest first.
DF[unmissed < luMiss][order(unmissed)]
# Coverage of the best-covered model relative to all combinations.
max(DF$unmissed)-luMiss
plot(DF$unmissed[order(DF$unmissed)])
```

Checks of variation inside redundant runs.
If the summary shows only 0s and 1s, then there is no NA variance.
redonePQ9TF: does the 90th percentile rank range over more than .02 within a run?

```{r}
# Key identifying one run configuration; rows sharing it are repeats.
DF[, keyOnerun := paste0(expirament, task, pc, fold, transTarg, transform, algomodel, seed)]
# Number of repeats per configuration.
DF[, redonee := .N, by = keyOnerun]

# Partial-failure flags (noPear + noUtrans) per repeat within a configuration.
DF[, redoneNA := (sum(noPear) + sum(noUtrans)) / redonee, by = keyOnerun]
summary(DF$redoneNA)
# Spread of pq9TargRank across the repeats of a configuration.
# A configuration whose pq9TargRank is entirely NA yields -Inf (with a
# warning from max/min) and is therefore flagged FALSE below.
DF[, redonePQ9 := max(pq9TargRank, na.rm = TRUE) - min(pq9TargRank, na.rm = TRUE), by = keyOnerun]
DF[, redonePQ9TF := redonePQ9 > .02]
# Up to 100 example rows whose spread exceeds .02. head() is used instead of
# [1:100] so that fewer than 100 matches do not pad the sample with all-NA
# rows (which would also put an NA into `exems`).
(specimen <- head(DF[(redonePQ9TF)][order(keyOnerun)], 100))
(exems <- unique(specimen$keyOnerun))
# Look at one hard-coded configuration.
# NOTE(review): `fold` now carries a ".<foldseed>" suffix, so this literal may
# no longer match any row - confirm.
DF[keyOnerun == "QSlink3QSlinks03_95ACEREBOUT21centernscaleSL.bartMachine403"]
```
Quick look at seconds for model to run.
```{r}
# Per configuration: standard deviation of the run time goes into `timesd`,
# and `time` itself is replaced by the configuration mean. Both values are
# computed from the original `time` before either column is written.
DF[, c("timesd", "time") := list(sd(time), mean(time)), by = keyOnerun]

# Mean against spread, then the distribution of the mean on two scales.
ggplot(DF, aes(x = time, y = timesd)) +
  geom_point()
ggplot(DF, aes(x = log(time))) +
  geom_histogram()
ggplot(DF, aes(x = sqrt(time))) +
  geom_histogram()
summary(DF$time)
# What each transform does to a zero run time.
log(0)
sqrt(0)
```


outliers moved to some more reasonable state
may be column specific
maybe cause of problem
```{r}

# Clamp extreme values in the first nine columns to the boxplot whisker ends
# (whiskers computed with coef = 3.5).
for (metric in names(DF)[1:9]) {#i<-2
  whiskers <- boxplot.stats(DF[[metric]], coef = 3.5)$stats
  print(whiskers)
  lowEnd <- whiskers[[1]]
  highEnd <- whiskers[[5]]

  # Values below the lower whisker are raised to it.
  tooLow <- DF[[metric]] < lowEnd
  print(summary(tooLow))
  #DF[(is.na(nafound))];
  #print(DF[(nafound)])
  DF[(tooLow), (metric) := lowEnd]

  # Values above the upper whisker are lowered to it.
  tooHigh <- DF[[metric]] > highEnd
  print(summary(tooHigh))
  DF[(tooHigh), (metric) := highEnd]
}
 
```

Metrics that are best when low are
made negative (inverted about the x axis),
because the code assumes higher is better.
This is done after the outlier step so that values do not move to 1e+10.
```{r}
# Error metrics for which lower is better; their sign is flipped so that,
# like the other metrics, a higher value means a better result.
metrics.invert <- c("ocvRMSE", "RMSEutrans", "MAEutrans")
# The columns are addressed by name (they sit at positions 7 to 9), so a
# change in column order cannot silently negate the wrong metric. The negation
# is vectorised, so no per-row grouping is needed.
# Running this chunk a second time flips the signs back.
DF[, (metrics.invert) := lapply(.SD, function(v) -1 * v), .SDcols = metrics.invert]

```
Net coincidence graph with everything that was to be removed, removed
```{r}
#DF$noPear
#DF$noUtrans
# 1 2345 67 8 910
# #get rid of this part when redoing
# Collapse a repeat count into a coarse bin label:
#   1       -> 1
#   2 .. 5  -> 3
#   6 .. 10 -> 8
# Any other input (0, values above 10, values between the bins such as 1.5,
# or NA) gives NA rather than an implicit NULL or an error, so the result
# always has the same length as the input.
#
# x: numeric vector of counts; a single value works as before.
# Returns a double vector with one bin label per element of `x`.
binnerr <- function(x) {
  binned <- rep(NA_real_, length(x))
  # which() drops NA comparisons, leaving those positions as NA.
  binned[which(x == 1)] <- 1
  binned[which(x >= 2 & x <= 5)] <- 3
  binned[which(x >= 6 & x <= 10)] <- 8
  binned
}
# Give every row its own number so the next step can group row by row.
DF[,rowNum:=seq_len(.N)]
# Sanity check: prints TRUE when the row numbers are unique.
length(unique(DF$rowNum))==dim(DF)[1]
# Bin the repeat count of each row with binnerr(), one row per group.
DF[,redoneeb:=binnerr(redonee),by=rowNum]
summary(DF$redoneeb)

# Variable list for the coincidence network; the surCoin() call that would
# consume it is commented out below.
set <- c("algomodel", "transTarg", "task", "transform", "fold", "pc",  "noPear", "noUtrans","redonePQ9TF","redoneeb")
#essCoin <- surCoin(data = DF, variables = set, lwidth ="Haberman", lweight="Haberman", color="variable", lcolor="Haberman", ltext="Haberman", linkFilter="Haberman", repulsion=90)
#essCoin
#plot(essCoin)
```

Warning! The R version should be included in most keys, but I just do not want to bother.
Remove completely redundant rows.
```{r}

# Rebuild the run-configuration key (same composition as in the variation
# check above).
DF[,keyOnerun:=paste0(expirament,task,pc,fold,transTarg,transform,algomodel,seed)]
print("dimensions before and after removal of duplicates")
dim(DF)
# Keep the first row of every distinct combination of the three metrics and
# the configuration key; rows that repeat all four values are dropped.
uDF<-unique(DF,by = c("pq9TargRank","pearson2","pRMSE","keyOnerun"))
dim(uDF)
#uDF[order(-spearman2)][order(-pRMSE)][order(-pq9TargRank)][order(-gainin30)]
# Read relative to the current working directory.
rcPs <- read_csv("recipePessimistic.csv")
problems(rcPs)
# Join on keyMiss and algomodel: one result row for every matching uDF row of
# each rcPs row; rcPs rows without a match are kept with NA in the uDF columns.
Unq<- uDF[as.data.table(rcPs), on=c("keyMiss","algomodel")]
save(Unq,file="routuDFpessimistUnique.RData")

# Keep the first 45 columns plus everything from column `maxcol_bef_fetcre`
# onwards, and print the resulting names.
# NOTE(review): `maxcol_bef_fetcre` is the column count taken before the
# derived columns were added, so the range starts at the last pre-existing
# column rather than the first derived one - confirm this is intended.
names(uDF<-uDF[,c(1:45,maxcol_bef_fetcre:dim(uDF)[2]),with=F])
save(uDF,file="routuDFintermhold.RData") #rout used so file is not sent to github

```

```{r}
# Print the joined rows that belong to the "blasso" model.
Unq[(algomodel=="blasso")]
```