The solution below is just an illustration; there are many ways in which this problem could have been tackled. I will use regularised least squares both on handcrafted bag-level features and on random Fourier features, which arise from large-scale approximations to kernel methods.
library(glmnet)
#load data
Xdf<-read.csv("~/Rwd/X.csv")
ytr<-read.csv("~/Rwd/ytr.csv")
ytst<-read.csv("~/Rwd/ytst.csv")
pinstance <- 7 #number of instance features
pbag <- 5 #number of bag features
p<-pinstance+pbag
ntr<-dim(ytr)[1]
ntst<-dim(ytst)[1]
n<-ntr+ntst
#row index of the first instance of each bag (instances are stored as 100 consecutive rows per bag);
#used to read off the bag-level attributes, which are constant within a bag
dlevel_indices<-1+100*(0:(n-1))
#scale first
Xdf[,1:p]<-scale(Xdf[,1:p])
Let us first try a very simple summarisation of the bags. We represent each bag by the means of its instance attributes and concatenate the bag-level attributes. Then we apply $L_2$-regularised least squares regression, selecting the regularisation parameter $\lambda$ by cross-validation.
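Concretely, with bag representations $\phi(B_i)$, ridge regression (the `alpha=0` case of glmnet) solves, up to glmnet's internal scaling and standardisation conventions,
$$\hat w,\hat b=\arg\min_{w,b}\ \sum_{i=1}^{n_{\mathrm{tr}}}\big(y_i-b-w^\top\phi(B_i)\big)^2+\lambda\lVert w\rVert_2^2,$$
with $\lambda$ chosen to minimise the 10-fold cross-validated mean squared error.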
#############################
#linear model on just means
Phi<-data.frame(matrix(0,n,12))
for(ii in 1:n){
Phi[ii,]<-colMeans(Xdf[Xdf$bag_Id==ii,1:12])
}
Phitr<-Phi[1:ntr,]
Phitst<-Phi[(ntr+1):n,]
#ridge regression (alpha=0); use 10-fold cross-validation to select lambda and assess accuracy
lmmean_cvfit<-cv.glmnet(as.matrix(Phitr),ytr$y,type.measure="mse",nfolds=10,alpha=0,lambda=2^((-30:6)/2))
plot(lmmean_cvfit)
cat("crossvalidation error (means only):", sqrt(min(lmmean_cvfit$cvm)))
lmmean_ypred<-predict(lmmean_cvfit,as.matrix(Phitst),s="lambda.min")
Now let us do the same, but also using the standard deviations and the 0.2 and 0.8 quantiles of the instance attributes within each bag.
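In symbols, each bag $B_i$ is now represented by a 33-dimensional vector: the means of all 12 attributes, followed by the standard deviations and the two quantiles of the 7 instance attributes,
$$\phi(B_i)=\big(\mathrm{mean}_{1:12},\ \mathrm{sd}_{1:7},\ q_{0.2,\,1:7},\ q_{0.8,\,1:7}\big)\in\mathbb{R}^{33}.$$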
#linear model on means+sdev+quantiles
qlo<-function(x) {quantile(x,0.2)}
qhi<-function(x) {quantile(x,0.8)}
Phi<-data.frame(matrix(0,n,33))
for(ii in 1:n){
Phi[ii,1:12]<-colMeans(Xdf[Xdf$bag_Id==ii,1:12])
Phi[ii,13:19]<-apply(Xdf[Xdf$bag_Id==ii,1:7],2,sd)
Phi[ii,20:26]<-apply(Xdf[Xdf$bag_Id==ii,1:7],2,qlo)
Phi[ii,27:33]<-apply(Xdf[Xdf$bag_Id==ii,1:7],2,qhi)
}
Phitr<-Phi[1:ntr,]
Phitst<-Phi[(ntr+1):n,]
#use 10-fold cross-validation to assess accuracy
lmmsdq_cvfit<-cv.glmnet(as.matrix(Phitr),ytr$y,type.measure="mse",nfolds=10,alpha=0,lambda=2^((-30:6)/2))
plot(lmmsdq_cvfit)
cat("crossvalidation error (means+sd+q): ", sqrt(min(lmmsdq_cvfit$cvm)))
lmmsdq_ypred<-predict(lmmsdq_cvfit,as.matrix(Phitst),s="lambda.min")
We can also try boosted decision trees on the same representation, again using 10-fold cross-validation and trying tree depths from 2 to 5. Looking at the RMSE over the last few boosting rounds, it appears to stabilise at around 0.8.
library(xgboost)
dtrain <- xgb.DMatrix(as.matrix(Phitr), label = ytr$y)
for(depth in 2:5){
cvbst <- xgb.cv(data=dtrain,nrounds=200,nfold=10,max_depth=depth,verbose=FALSE)
cat("depth ",depth,":\n")
print(cvbst$evaluation_log[190:200,])
}
bst<-xgboost(data=dtrain,nrounds=200,max_depth=4,verbose=FALSE) #fit the final model (depth 4, informed by the cross-validation above)
bst_ypred<-predict(bst,as.matrix(Phitst))
Now, let us consider random Fourier feature (RFF) expansions. First, we generate random Fourier features for the instance attributes and summarise each bag by their averages (approximate kernel mean embeddings). We then concatenate random Fourier features for the bag attributes.
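To recall why cosines and sines appear in the code below: for the Gaussian kernel $k(x,x')=\exp(-\lVert x-x'\rVert^2/(2\theta^2))$, sampling frequencies $w_1,\dots,w_m\sim N(0,\theta^{-2}I)$ and setting
$$\varphi(x)=\big(\cos(w_1^\top x),\dots,\cos(w_m^\top x),\,\sin(w_1^\top x),\dots,\sin(w_m^\top x)\big)\in\mathbb{R}^{2m}$$
gives $\tfrac{1}{m}\,\varphi(x)^\top\varphi(x')\approx k(x,x')$; the $1/m$ factor is omitted in the code since it is absorbed into the regression weights. Averaging $\varphi$ over the instances of a bag yields the approximate kernel mean embedding of that bag.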
##########################
#random feature embeddings
m1<-100 #number of random frequencies for the instance attributes
m2<-50 #number of random frequencies for the bag attributes
med.heur = function(X) {
##computes the median heuristic for the kernel bandwidth theta of
##the Gaussian RBF kernel exp(-0.5||x-x'||^2/theta^2)
##heuristic is: theta=median(||x-x'||)/sqrt(2)
n <- dim(X)[1]
if(n>1000){
Xs <- X[sample(1:n,1000),]
heur <- median(dist(Xs))/sqrt(2)
} else {
heur <- median(dist(X))/sqrt(2)
}
return(heur)
}
#random Fourier features: map each row x of X to (cos(x^T W),sin(x^T W))
#(rff1 and rff2 are identical; separate names are kept for the instance and bag frequency matrices)
rff1 <- function(X,W1){
M<-as.matrix(X)
return(cbind(cos(M%*%W1),sin(M%*%W1)))
}
rff2 <- function(X,W2){
M<-as.matrix(X)
return(cbind(cos(M%*%W2),sin(M%*%W2)))
}
#use median heuristic bandwidths
med1<-med.heur(Xdf[,1:7])
med2<-med.heur(Xdf[dlevel_indices,8:12])
#sample standard normal frequencies (rescaled by the bandwidths, W/med, when used below)
W1<-matrix(rnorm(7*m1),7,m1)
W2<-matrix(rnorm(5*m2),5,m2)
#each set of m frequencies yields 2m features (cosines and sines)
Phi<-data.frame(matrix(0,n,2*(m1+m2)))
for(ii in 1:n){
#approximate kernel mean embedding of the bag's instance attributes
Phi1<-colMeans(rff1(Xdf[Xdf$bag_Id==ii,1:7],W1/med1))
#random Fourier features of the bag-level attributes
Phi2<-rff2(Xdf[dlevel_indices[ii],8:12],W2/med2)
Phi[ii,]<-c(Phi1,Phi2)
}
Phitr<-Phi[1:ntr,]
Phitst<-Phi[(ntr+1):n,]
rffadditive_cvfit<-cv.glmnet(as.matrix(Phitr),ytr$y,type.measure="mse",nfolds=10,alpha=0, lambda=2^((-30:6)/2))
plot(rffadditive_cvfit)
cat("crossvalidation error (rff additive): ", sqrt(min(rffadditive_cvfit$cvm)))
rffadditive_ypred<-predict(rffadditive_cvfit,as.matrix(Phitst),s="lambda.min")
Now, let us try a model that captures interactions between the instance-level and the bag-level attributes. We form its features by taking the outer product of the approximate kernel mean embedding computed on the instance attributes with the random Fourier features of the bag attributes.
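The reason this captures interactions is that the (vectorised) outer product of feature maps corresponds to the product of the underlying kernels: writing $\mu(B)$ for the averaged instance-level RFFs and $\varphi_2(z)$ for the bag-attribute RFFs,
$$\big\langle \mu(B)\,\varphi_2(z)^\top,\ \mu(B')\,\varphi_2(z')^\top\big\rangle=\langle\mu(B),\mu(B')\rangle\,\langle\varphi_2(z),\varphi_2(z')\rangle,$$
so a linear model on these features approximates regression with a product kernel on the (bag, bag-attribute) pairs.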
#the outer product of a 2*m1-vector and a 2*m2-vector gives 4*m1*m2 interaction features per bag
Phi<-data.frame(matrix(0,n,4*m1*m2))
for(ii in 1:n){
Phi1<-colMeans(rff1(Xdf[Xdf$bag_Id==ii,1:7],W1/med1))
Phi2<-rff2(Xdf[dlevel_indices[ii],8:12],W2/med2)
Phi[ii,]<-c(Phi1%*%Phi2) #Phi2 has a single row, so this is the (vectorised) outer product
}
Phitr<-Phi[1:ntr,]
Phitst<-Phi[(ntr+1):n,]
rffouter_cvfit<-cv.glmnet(as.matrix(Phitr),ytr$y,type.measure="mse",nfolds=10,alpha=0, lambda=2^((-30:6)/2))
plot(rffouter_cvfit)
cat("crossvalidation error (rff outer product): ", sqrt(min(rffouter_cvfit$cvm)))
rffouter_ypred<-predict(rffouter_cvfit,as.matrix(Phitst),s="lambda.min")
OK, we are now happy with our models, so let's compare them on the test set.
rmse<-function(ypred){sqrt(mean((ypred-ytst$y)^2))}
results<-cbind(LMMEANS=rmse(lmmean_ypred),LMMSDQ=rmse(lmmsdq_ypred),
XGBMSDQ=rmse(bst_ypred),RFFADD=rmse(rffadditive_ypred),
RFFOUTER=rmse(rffouter_ypred))
results