Commit 6d997225 authored by Tomas Martinovic's avatar Tomas Martinovic

initial commit

parent 2b7bec28
# Plugin hth
## Description
## Requirements
The main script that performs the computation is written in R.
Communication with Cassandra is done using JDBC driver for Cassandra.
- Rscript (tested with version 3.4.3)
## Dependency
The current implementation requires the following packages:
- the RJDBC, DiceDesign, DiceEval and DiceKriging packages for R
```R
install.packages('RJDBC')
install.packages('DiceDesign')
install.packages('DiceEval')
install.packages('DiceKriging')
```
# Create a design of experiments (DOE) over the grid of knob configurations
# and initialise the (empty) model table in the configured storage.
#
# Args:
#   knobs_config_list: list with one numeric vector of admissible values per knob.
#   doe_options: list with $nobs (number of points to pick) and $eps
#     (minimum-distance parameter for the DiceDesign generators).
#   knobs_names: character vector naming the knobs (grid column names).
#   model_container_name: Cassandra table name or CSV path for the model table.
#   metric_names: metric names; a "<name>_avg" and "<name>_std" column is
#     created for each.
#   limits: optional character vector of filter expressions restricting the
#     grid (parsed with rlang::parse_quo and evaluated inside dplyr::filter).
#   algorithm: one of "full_factorial", "strauss", "dmax", "lhs", "wsp",
#     "factorial3", "factorial5" (default "dmax").
#   storage_type: "CASSANDRA" or "CSV" (default "CSV").
#
# Returns: the selected design points taken from the configuration grid --
#   a matrix when there is more than one knob, a plain vector for one knob.
#
# NOTE(review): relies on dplyr/stringr/rlang being attached by the caller
# and, in the Cassandra branch, on an open `conn` object from the calling
# environment -- confirm main.R always provides them.
create_doe <- function(knobs_config_list, doe_options, knobs_names, model_container_name, metric_names, limits = NULL, algorithm = "dmax", storage_type = "CSV") {
suppressMessages(suppressPackageStartupMessages(library(DiceDesign)))
# Cartesian product of all knob values: the full, unrestricted design space.
full_design <- expand.grid(knobs_config_list)
names(full_design) <- knobs_names
# Restrict design space ---------------------------------------------------
if (!is.null(limits)) {
# The limit strings are parsed and evaluated as R code below, so refuse
# anything that could smuggle in a system() call.
if (any(grepl("system", limits))) {
stop("Error: No funny plays with system calls through constraints evaluation are allowed. In case you did not meant to do system call, please do not use knobs with 'system' in the name.")
}
discarded_designs <- full_design
for (limit_iter in limits) {
full_design <- full_design %>% filter(!!parse_quo(limit_iter, env = environment()))
}
# Rows removed by the constraints (computed but not used further here).
discarded_designs <- discarded_designs %>% setdiff(full_design)
}
# Write model table -------------------------------------------------------
# Every grid point is stored with NA metric columns; the model plugin is
# expected to fill them in later.
if (storage_type == "CASSANDRA"){
for (row.ind in 1:nrow(full_design))
{
set_columns <- str_c(c(knobs_names, str_c(metric_names, "_avg"), str_c(metric_names, "_std")), collapse = ", ")
set_values <- str_c(c(full_design[row.ind, ], rep(NA, length(metric_names)*2)), collapse = ", ")
# NOTE(review): `conn` is resolved from the caller's environment, not a
# parameter of this function -- confirm it is always defined when
# storage_type == "CASSANDRA".
dbSendUpdate(conn, str_c("INSERT INTO ", model_container_name, "(", set_columns, ") VALUES (", set_values, ")"))
}
} else if (storage_type == "CSV"){
temp_df <- full_design
# Append one NA-filled "<metric>_avg" and "<metric>_std" column per metric.
for(metric_name in metric_names){
metric_avg <- str_c(metric_name, "_avg")
metric_std <- str_c(metric_name, "_std")
temp_df <- temp_df %>% mutate(!!metric_avg := NA)
temp_df <- temp_df %>% mutate(!!metric_std := NA)
}
write.table(temp_df, file = model_container_name, col.names = TRUE, row.names = FALSE, sep = ",", dec = ".")
}
# Generate points to explore ----------------------------------------------
if (algorithm == "full_factorial") {
design <- full_design
} else {
# Set number of dimensions based on the number of knobs in the input list
ndim <- length(knobs_config_list)
# Set options from the options list
# Never request more points than the (restricted) grid contains.
nobs <- min(doe_options$nobs, nrow(full_design))
eps <- doe_options$eps
# Create design in [0,1]^n space
design <- switch(algorithm,
strauss = straussDesign(nobs, ndim, eps),
dmax = dmaxDesign(nobs, ndim, eps),
lhs = lhsDesign(nobs, ndim),
wsp = {
initial_design <- dmaxDesign(nobs, ndim, eps)$design
wspDesign(initial_design, eps)},
factorial3 = {
list(design = unique(t(combn(rep(c(1, 0, 0.5), ndim), ndim))))},
factorial5 = {
list(design = unique(t(combn(rep(c(1, 0, 0.25, 0.75, 0.5), ndim), ndim))))
})
# NOTE(review): an unknown `algorithm` makes switch() return NULL and the
# next line fail with an unhelpful error -- consider validating earlier.
design <- design$design
# Transform information from the knobs_config_list into matrix
# Row 1 holds each knob's minimum, row 2 its range (max - min).
knob_transform <- sapply(knobs_config_list, function(knob_config) {
knob_adjusted_max <- max(knob_config) - min(knob_config)
knob_min <- min(knob_config)
return(c(knob_min, knob_adjusted_max))
})
# Upscale design to the input space
for (i in 1:ndim) {
design[, i] <- design[, i] * knob_transform[2, i] + knob_transform[1, i]
}
# Map points to the application DOE ---------------------------------------
# The generated points are continuous; snap each one greedily, without
# replacement, to the nearest unused discrete grid point (L1 distance).
if (length(knobs_config_list) > 1) {
# Create matrix of all the possible knob design points
design_space_grid <- as.matrix(full_design)
# Give design_space_grid row names of 1 to nobs
row.names(design_space_grid) <- 1:dim(design_space_grid)[1]
# Find the index of the point closes to the 1-st design point
ind <- row.names(design_space_grid)[which.min(rowSums(abs(t(apply(design_space_grid, 1, function(x) {
x - design[1, ]
})))))]
# NOTE(review): assumes at least two generated design points --
# `2:dim(design)[1]` would run backwards when only one point exists.
for (i in 2:dim(design)[1]) {
# Find the index of the point closest to the i-th design point, discard already selected designs
ind_temp <- which.min(rowSums(abs(t(apply(design_space_grid[-as.numeric(ind), ], 1, function(x) {
x - design[i, ]
})))))
# Add row name of the point closest to the i-th design point from the grid to the index vector
ind <- c(ind, row.names(design_space_grid[-as.numeric(ind), ])[ind_temp])
}
# Subset selected design points
design <- design_space_grid[as.numeric(ind), ]
} else {
# Single-knob case: the grid is a plain vector, same greedy mapping.
warning("Your knobs_config_list has only one knob.")
design_space_grid <- knobs_config_list[[1]]
names(design_space_grid) <- 1:length(design_space_grid)
# Find the index of the point closes to the 1-st design point
ind <- names(design_space_grid)[which.min(abs(design_space_grid - design[1, ]))]
for (i in 2:length(design)) {
# Find the index of the point closest to the i-th design point, discard already selected designs
ind_temp <- which.min(abs(design_space_grid[-as.numeric(ind)] - design[i, ]))
# Add row name of the point closest to the i-th design point from the grid to the index vector
ind <- c(ind, names(design_space_grid[-as.numeric(ind)])[ind_temp])
}
# Subset selected design points
design <- design_space_grid[as.numeric(ind)]
}
}
return(design)
}
#!/bin/bash
# The environmental file to source is passed as the first argument by agora.
ENVIRONMENTAL_FILE=$1
# Quote the expansion so paths containing spaces do not break the source.
source "$ENVIRONMENTAL_FILE"
################################################################################
# THIS IS THE PLUGIN ENTRY POINT
#-------------------------------------------------------------------------------
#
# The environmental file provides to this script variables to handle the
# generation of the model, in particular it exports:
# - STORAGE_TYPE -> with the type of the supported storage; the current
#                   implementation supports the Cassandra database and CSV files
# - STORAGE_ADDRESS -> with the address of the storage
# - STORAGE_USERNAME -> optionally, the username required to authenticate
#                       with the storage
# - STORAGE_PASSWORD -> optionally, the password required to authenticate
#                       with the storage
# - OBSERVATION_CONTAINER_NAME -> the name of the container of the observation
#                   of the application behavior. The columns of this container
#                   are the following (in order):
#                   - "day": number of days since epoch
#                   - "time": number of nanoseconds since midnight of day
#                   - "client_id": the string id of the client
#                   - "<knob>": the name of each knob of the application
#                   - "<feature>": the name of each feature of the application
#                   - "<metric>": the name of each metric of the application
# - MODEL_CONTAINER_NAME -> the input/output container of all the required
#                   predictions of the model. The columns of this container are
#                   the following:
#                   - "<knob>": the name of each knob of the application
#                   - "<feature>": the name of each feature of the application
#                   - "<metric_avg>": the expected mean value of each metric
#                   - "<metric_std>": the expected standard deviation of each metric
#                   NOTE: the idea is that the model should update the <metric_*>
#                   fields of each row of this table.
# - KNOBS_CONTAINER_NAME -> the container with the name and type of each knob.
#                   The names of the columns are the following:
#                   - "name": the name of the software knob
#                   - "type": the type of the software knob
# - FEATURES_CONTAINER_NAME -> the container with the name and type of each feature.
#                   The names of the columns are the following:
#                   - "name": the name of the feature
#                   - "type": the type of the feature
# - METRIC_NAME -> the name of the metric to predict
# - METRIC_ROOT -> the path of this folder when called by agora
#
# It is up to the plugin writer to use this script to call the tools that
# perform the prediction. The remote application handler checks the return
# value of this script to make sure that everything is fine.
# Once this script is completed, the remote handler assumes that the
# prediction is completely done.
################################################################################
# exit on the first failing command
set -e
# This wrapper cannot interact with the storage directly, so it delegates to
# an R script that loads the configuration, generates the DOE and writes it
# back to the storage. stdout/stderr are appended to per-plugin log files.
Rscript "$METRIC_ROOT/main.R" "$METRIC_ROOT" >> "$METRIC_ROOT/stdout.log" 2>> "$METRIC_ROOT/stderr.log"
# Read the list of admissible values for every knob from the configured
# storage backend.
#
# Args:
#   storage_type: "CASSANDRA" or "CSV".
#   container_name: Cassandra table name or CSV path holding a `values`
#     column with the serialized value lists.
#   conn: open JDBC connection; only used by the Cassandra branch.
#
# Returns: a list with one numeric vector per knob.
get_config_list <- function(storage_type, container_name, conn = NULL) {
  if (storage_type == "CASSANDRA") {
    raw_rows <- dbGetQuery(conn, paste0("SELECT values FROM ", container_name))
    # Each cell is a serialized list like "[1, 2, 3]": drop the surrounding
    # brackets, split on ", " and coerce to numeric.
    knobs_config_list <- unlist(
      apply(as.matrix(raw_rows, nrow = 1), 1, function(cell) {
        cell_chr <- as.character(cell)
        inner <- substring(cell_chr, 2, nchar(cell_chr) - 1)
        lapply(strsplit(inner, ", "), as.numeric)
      }),
      recursive = FALSE
    )
  } else if (storage_type == "CSV") {
    # In CSV storage the values column uses ';' as the in-cell separator.
    raw_values <- read_csv(container_name) %>% pull(values)
    knobs_config_list <- map(str_split(raw_values, ";"), as.numeric)
  }
  return(knobs_config_list)
}
library("tidyverse")
library("rlang")
library("magrittr")
# Avoid scientific notation when values are serialized to the storage.
options(scipen = 100)
map_to_input <- TRUE
limits <- NULL
######################## GET THE ARGUMENTS ############################
args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 1) {
  stop("Error: Number of input parameters is less than 1 (Please input the root_path)", call. = FALSE)
} else {
  root_path <- args[1]
  if (length(args) > 1) {
    # Fix: `args` is a character vector, so nrow(args) is NULL and
    # `args[2:nrow(args)]` errored at runtime; use length() instead.
    print(paste("Warning: the following program option are ignored:", args[2:length(args)], collapse = ", "))
  }
}
######################## SET WORKSPACE PATH AND VARIABLES #######################
setwd(root_path)
source("create_discrete_doe.R")
source("get_config_list.R")
# Each relevant line of the env file looks like: export KEY="value".
# Splitting on '"' leaves the value as the second token of the matching line.
configurations <- readLines(con = "agora_config.env")
configurations <- strsplit(configurations, '"')
# Look up the value exported for a given key in the parsed env file.
extract_config <- function(key) {
  configurations %>% .[grepl(key, .)] %>% unlist %>% .[2]
}
storage_type <- extract_config("STORAGE_TYPE")
storage_address <- extract_config("STORAGE_ADDRESS")
application_name <- extract_config("APPLICATION_NAME")
metric_name <- extract_config("METRIC_NAME")
algorithm <- extract_config("DOE_NAME")
doe_eps <- extract_config("MINIMUM_DISTANCE") %>% as.numeric
doe_obs_per_conf <- extract_config("NUMBER_OBSERVATIONS_PER_CONFIGURATION") %>% as.numeric
doe_obs_per_iter <- extract_config("NUMBER_CONFIGURATIONS_PER_ITERATION") %>% as.numeric
# DOE_LIMITS is optional: a ';'-separated list of constraint expressions.
if (any(grepl("DOE_LIMITS", configurations))) {
  limits <- configurations[grepl("DOE_LIMITS", configurations)]
  limits <- limits[[1]][2]
  limits <- strsplit(limits, ";")[[1]]
}
print(paste("Started DOE plugin. Metric:", metric_name))
########################### LOAD DATA #######################################
# CREATE THE TABLES NAMES
# Slashes in the application name would produce invalid table/file names.
application_name <- gsub("/", "_", application_name)
if (storage_type == "CASSANDRA") {
  # Connect to the database using the JDBC codecs.
  suppressMessages(suppressPackageStartupMessages(library("RJDBC")))
  knobs_container_name <- paste0("margot.", application_name, "_knobs")
  features_container_name <- paste0("margot.", application_name, "_features")
  observation_container_name <- paste0("margot.", application_name, "_trace")
  model_container_name <- paste0("margot.", application_name, "_model")
  doe_container_name <- paste0("margot.", application_name, "_doe")
  metrics_container_name <- paste0("margot.", application_name, "_metrics")
  # CONNECT TO CASSANDRA
  driver <- JDBC("com.github.adejanovski.cassandra.jdbc.CassandraDriver", "cassandra-jdbc-wrapper-3.1.0.jar", identifier.quote = "'")
  full_address_string <- paste0("jdbc:cassandra://", storage_address, ":9042")
  conn <- dbConnect(driver, full_address_string)
  # READ CONFIGURATION FROM CASSANDRA
  knobs_names <- dbGetQuery(conn, paste0("SELECT name FROM ", knobs_container_name))
  features_names <- dbGetQuery(conn, paste0("SELECT name FROM ", features_container_name))
  metric_names <- dbGetQuery(conn, paste0("SELECT name FROM ", metrics_container_name))
} else if (storage_type == "CSV") {
  knobs_container_name <- paste0(storage_address, "/", application_name, "_knobs.csv")
  features_container_name <- paste0(storage_address, "/", application_name, "_features.csv")
  observation_container_name <- paste0(storage_address, "/", application_name, "_trace.csv")
  model_container_name <- paste0(storage_address, "/", application_name, "_model.csv")
  doe_container_name <- paste0(storage_address, "/", application_name, "_doe.csv")
  metrics_container_name <- paste0(storage_address, "/", application_name, "_metrics.csv")
  # NOTE(review): unlike the Cassandra branch, feature names are not read
  # here -- confirm whether features are unsupported for CSV storage.
  knobs_names <- read_csv(knobs_container_name) %>% pull(name)
  metric_names <- read_csv(metrics_container_name) %>% pull(name)
  conn <- NULL
} else {
  # Fix: corrected typo "uknown" -> "unknown" in the user-facing message.
  stop(paste0("Error: unknown $STORAGE_TYPE ", storage_type, ". Please, select storage type CASSANDRA or CSV."), call. = FALSE)
}
# Sanity check: at least one knob must be defined.
nknobs <- length(knobs_names)
if (nknobs == 0) {
  stop("Error: no knobs found. Please specify the knobs.")
}
writeLines(str_c("Number of KNOBS: ", nknobs))
############################ CREATE DOE ############################
doe_design <- create_doe(knobs_config_list, doe_options, knobs_names, model_container_name, metric_names, limits, algorithm, storage_type)
# ADD COUNTER COLUMN
# The counter column holds how many observations are still required for
# each configuration.
if (is.null(doe_design)) {
  # Defensive branch: pair each design value with the counter column.
  doe_design <- matrix(c(doe_design, rep(doe_obs_per_conf, length(doe_design))), ncol = 2)
} else {
  # cbind turns a single-knob vector into a two-column matrix as well.
  doe_design <- cbind(doe_design, doe_obs_per_conf)
}
doe_names <- c(knobs_names, "counter")
# Fix: doe_design is a matrix here, so the intended column labels must be
# set with colnames(); names() only attached an unused, NA-padded attribute.
colnames(doe_design) <- doe_names
################################# WRITE DOE #################################
if (storage_type == "CASSANDRA") {
  # The column list is the same for every row, so build it once.
  set_columns <- paste(doe_names, collapse = ", ")
  for (row.ind in seq_len(nrow(doe_design))) {
    set_values <- paste(doe_design[row.ind, ], collapse = ", ")
    dbSendUpdate(conn, paste0("INSERT INTO ", doe_container_name, "(", set_columns, ") VALUES (", set_values, ")"))
  }
} else if (storage_type == "CSV") {
  # Append without a header: the DOE file may already contain earlier rows.
  write.table(doe_design, file = doe_container_name, col.names = FALSE, row.names = FALSE, sep = ",", dec = ".", append = TRUE)
}
print("Wrote new DOE configurations")
# Exit without saving the workspace image.
q(save = "no")
# Plugin hth
## Description
## Requirements
The main script that performs the computation is written in R.
Communication with Cassandra is done using JDBC driver for Cassandra.
- Rscript (tested with version 3.4.3)
## Dependency
The current implementation requires the following packages:
- the RJDBC, DiceDesign, DiceEval and DiceKriging packages for R
```R
install.packages('RJDBC')
install.packages('DiceDesign')
install.packages('DiceEval')
install.packages('DiceKriging')
```
#!/bin/bash
# The environmental file to source is passed as the first argument by agora.
ENVIRONMENTAL_FILE=$1
# Quote the expansion so paths containing spaces do not break the source.
source "$ENVIRONMENTAL_FILE"
################################################################################
# THIS IS THE PLUGIN ENTRY POINT
#-------------------------------------------------------------------------------
#
# The environmental file provides to this script variables to handle the
# generation of the model, in particular it exports:
# - STORAGE_TYPE -> with the type of the supported storage; the current
#                   implementation supports the Cassandra database and CSV files
# - STORAGE_ADDRESS -> with the address of the storage
# - STORAGE_USERNAME -> optionally, the username required to authenticate
#                       with the storage
# - STORAGE_PASSWORD -> optionally, the password required to authenticate
#                       with the storage
# - OBSERVATION_CONTAINER_NAME -> the name of the container of the observation
#                   of the application behavior. The columns of this container
#                   are the following (in order):
#                   - "day": number of days since epoch
#                   - "time": number of nanoseconds since midnight of day
#                   - "client_id": the string id of the client
#                   - "<knob>": the name of each knob of the application
#                   - "<feature>": the name of each feature of the application
#                   - "<metric>": the name of each metric of the application
# - MODEL_CONTAINER_NAME -> the input/output container of all the required
#                   predictions of the model. The columns of this container are
#                   the following:
#                   - "<knob>": the name of each knob of the application
#                   - "<feature>": the name of each feature of the application
#                   - "<metric_avg>": the expected mean value of each metric
#                   - "<metric_std>": the expected standard deviation of each metric
#                   NOTE: the idea is that the model should update the <metric_*>
#                   fields of each row of this table.
# - KNOBS_CONTAINER_NAME -> the container with the name and type of each knob.
#                   The names of the columns are the following:
#                   - "name": the name of the software knob
#                   - "type": the type of the software knob
# - FEATURES_CONTAINER_NAME -> the container with the name and type of each feature.
#                   The names of the columns are the following:
#                   - "name": the name of the feature
#                   - "type": the type of the feature
# - METRIC_NAME -> the name of the metric to predict
# - METRIC_ROOT -> the path of this folder when called by agora
#
# It is up to the plugin writer to use this script to call the tools that
# perform the prediction. The remote application handler checks the return
# value of this script to make sure that everything is fine.
# Once this script is completed, the remote handler assumes that the
# prediction is completely done.
################################################################################
# exit on the first failing command
set -e
# This wrapper cannot interact with the storage directly, so it delegates to
# an R script that loads the data, generates the actual model and writes the
# results back to the storage. stdout/stderr go to per-plugin log files.
Rscript "$METRIC_ROOT/model.R" "$METRIC_ROOT" >> "$METRIC_ROOT/stdout.log" 2>> "$METRIC_ROOT/stderr.log"
# Read the list of admissible values for every knob from the configured
# storage backend.
#
# Args:
#   storage_type: "CASSANDRA" or "CSV".
#   container_name: Cassandra table name or CSV path holding a `values`
#     column with the serialized value lists.
#   conn: open JDBC connection; only used by the Cassandra branch.
#
# Returns: a list with one numeric vector per knob.
#
# NOTE(review): any other storage_type falls through both branches and the
# final return fails with "object not found" -- consider an explicit stop().
get_config_list <- function(storage_type, container_name, conn = NULL) {
if (storage_type == "CASSANDRA") {
knobs_config_list <- dbGetQuery(conn, paste("SELECT values FROM ", container_name, sep = ""))
# Each cell is a serialized list like "[1, 2, 3]": strip the surrounding
# brackets, split on ", " and coerce the pieces to numeric.
knobs_config_list <- unlist(apply(as.matrix(knobs_config_list, nrow = 1), 1, function(x) {
lapply(strsplit(substring(as.character(x), 2, nchar(as.character(x)) - 1), ", "), function(y) as.numeric(y))
}), recursive = FALSE)
} else if ( storage_type == "CSV" ) {
# In CSV storage the values column uses ';' as the in-cell separator.
knobs_config_list <- read_csv(container_name) %>% pull(values)
knobs_config_list <- str_split(knobs_config_list, ";")
knobs_config_list <- map(knobs_config_list, function(x)as.numeric(x))
}
return(knobs_config_list)
}
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment