by Maroun Bou Sleiman

This R Markdown notebook explores electrical feeding data from EDL.

Setup

This notebook was run on Renku. The project is using the “R basic template 4.0” with these differences. Extra debian packages installed (see Dockerfile), extra R packages(see install.R)

Data from the EDL website was fetched using a custom web crawler and using this command renku run ./R/edl_feeding_crawler.R. See README.rmd for more information.

Then there is a post-processing step renku run ./R/edl_feeding_postprocess.R. Here, some minor modifications are done to the data and a feeble attempt to geolocate these stations based on a simple search using the tidygeocoder package.

Load the pre-processed data

First load some R packages and the data

suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(cowplot)) # for arranging and combining plots
suppressPackageStartupMessages(library(igraph))
suppressPackageStartupMessages(library(visNetwork))
suppressPackageStartupMessages(library(leaflet))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(lubridate))
suppressPackageStartupMessages(library(tidyr))


load("../data/edl_feeding_crawler_output_postprocessed.RData", verbose = T)
## Loading objects:
##   feeding_all_reshaped
##   feeding_all
##   outlet_meta
##   stations_dt

Some stats: we have 85 main stations, 1008 outlets, and 143305 data points from 166 days. Let’s check the metadata first: the stations

head(stations_dt)

Let’s check the metadata first: the substations/outlets

head(outlet_meta)

And here’s how the data looks like. 0 = no electricity, 1 = electricity

head(feeding_all)

For convenience, one can reshape the dataset to a long format (larger in size):

timecols <- paste0("time", 0:23)
feeding_all_reshaped <- feeding_all %>% gather(time, power, paste0("time", 0:23) )
feeding_all_reshaped$time <- as.numeric(gsub("time", "",feeding_all_reshaped$time))
feeding_all_reshaped$dateoffeeding_lub <- as_datetime(ymd(feeding_all_reshaped$dateoffeeding))
hour(feeding_all_reshaped$dateoffeeding_lub) <- feeding_all_reshaped$time
table(feeding_all_reshaped$power)
## 
##       0       1 
## 1483294 1956026
table(feeding_all_reshaped$subgroup)
## 
##     A1     A2     B1     B2 
## 858744 871920 893880 814776
table(feeding_all_reshaped$substationid)
## 
##      1     10     11     12     13     14     15     16     17     18     19 
##  76632  98208  74280  69552  60672  70416  75480  31488  15552  28248  95760 
##      2     20     21     22     23     24     25     26     27     28     29 
##  68040  19680  51816  26376  34560  44928  70416  42048  38952  26760  84480 
##      3     30     31     32     33     34     35     36     37     38     39 
##  84888  95088  23472   6864  27552  15456  23616  35208  35208  65184  21744 
##      4     40     41     42     43     44     45     46     47     48     49 
##  53856  23760  11664  27984  20352  18816  66120  16848  11808  11808  51048 
##      5     50     51     52     53     55     56     57     58     59      6 
##  66096  47160  31296  54864  35304   3936  27456   3936   7824  15648  85536 
##     60     61     62     64     65     66     67     68     69      7     70 
##   7848   7824  11760  11784  15696   7824  11736  11736  34560 150696  69552 
##     71     72     73     74     75     76     77     78     79      8     80 
##  30720  34560  43296  20016   3936   3936  47232  61272  43296  86400  11808 
##     81     82     83     84     85     86     87      9 
##  27552  16992 115392  52560  15744  38880  46944  66024

Get powerplant data

power_plants <- readxl::read_xls("../data/edl_power_plant_2020.xls", skip = 2)
coords <- do.call(rbind, strsplit(gsub("( )+|( )+", "", power_plants$gps_coordinates), ","))
colnames(coords) <- c("latitude", "longitude")
coords <- apply(coords, MAR = c(1,2), as.numeric)
power_plants <- cbind(power_plants, coords)

Construct graph

graph_df <- data.frame(from = outlet_meta$global_station_id, 
                       to = outlet_meta$global_id, 
                       station_id = outlet_meta$station_id, 
                       outlet_id = outlet_meta$id)
graph_df$from_lat <- stations_dt$latitude[match(outlet_meta$global_station_id, stations_dt$global_id)]
graph_df$from_lng <- stations_dt$longitude[match(outlet_meta$global_station_id, stations_dt$global_id)]

graph_df$to_lat <- outlet_meta$latitude
graph_df$to_lng <- outlet_meta$longitude

vertex_meta <- data.frame(name = unique(c(graph_df$from, graph_df$to)))
vertex_meta$type <- gsub("_(.)+", "", vertex_meta$name)
vertex_meta$internal_id <- gsub("(.)+_", "", vertex_meta$name)

vertex_meta <- split(vertex_meta, vertex_meta$type)
vertex_meta <- lapply(vertex_meta, function(x){
  if(x$type[1] == "station"){
    out <- x
    out$label <- stations_dt$station_name[match(x$name, stations_dt$global_id)]
    out
  }else{
    out <- x
    out$label <- outlet_meta$text[match(x$name, outlet_meta$global_id)]
    out
  }
})
vertex_meta <- do.call(rbind, vertex_meta)
vertex_meta$shape <- plyr::mapvalues(vertex_meta$type, c("outlet", "station"), c("circle", "square"))

g <- igraph::graph_from_data_frame(graph_df, vertices = vertex_meta)

# visNetwork::visIgraph(g) %>% visIgraphLayout(layout = "layout_nicely") %>%
#   visNodes(size = 10) %>%
#   visOptions(highlightNearest = list(enabled = T, hover = T), 
#              nodesIdSelection = T)


lf <- leaflet(stations_dt) %>%
  addTiles() %>%
  leaflet::addAwesomeMarkers(lng = stations_dt$longitude, 
                             lat = stations_dt$latitude, 
                             label = stations_dt$station_name, 
                             icon = awesomeIcons(icon = "bolt", library = "fa")) %>% 
  leaflet::addCircles(lng = outlet_meta$longitude, lat = outlet_meta$latitude, label = outlet_meta$text) %>% 
  leaflet::addAwesomeMarkers(lng = power_plants$longitude, 
                             lat = power_plants$latitude, 
                             label = power_plants$power_plant_group, 
                             icon = awesomeIcons(icon = ifelse(power_plants$type=="Thermal", "industry", "tint"),
                                                 library = rep("fa", nrow(power_plants)),
                                                 iconColor = rep("black", nrow(power_plants)),
                                                 markerColor = rep("orange", nrow(power_plants)))) %>%
  fitBounds(35, 33, 37, 35)
## Warning in validateCoords(lng, lat, funcName): Data contains 30 rows with either
## missing or invalid lat/lon values and will be ignored
## Warning in validateCoords(lng, lat, funcName): Data contains 979 rows with
## either missing or invalid lat/lon values and will be ignored
# which_known_path <- which(rowSums(!is.na(graph_df)[,c("from_lat", "from_lng", "to_lat", "to_lng")]) == 4)
# for(i in which_known_path){
#   lf <- lf %>% leaflet::addPolylines(lng = c(graph_df$from_lng[i], graph_df$to_lng[i]), lat = c(graph_df$from_lat[i], graph_df$to_lat[i]))
# }
lf
suppressPackageStartupMessages(library(dtw))
timecols <- paste0("time", 0:23)

# make a timeseries from the first day to the last day
first_day <- lubridate::as_date(min(feeding_all$dateoffeeding))
last_day <- lubridate::as_date(max(feeding_all$dateoffeeding))

ndays <- last_day - first_day


ts_dates <- first_day + 0:ndays
ts_dates_times <- rep(ts_dates, each = 24)
ts_dates_times <- paste0(ts_dates_times, "_", timecols)
feeding_all$dateoffeeding <- lubridate::as_date(feeding_all$dateoffeeding)
feeding_all_reshaped$dateoffeeding <- lubridate::as_date(feeding_all_reshaped$dateoffeeding)
feeding_all_reshaped$power <- as.numeric(feeding_all_reshaped$power)
ts_matrix <- lapply(split(feeding_all, feeding_all$feederid), function(x){
  x <- x[order(x$dateoffeeding), ]
  out <- apply(x[,timecols], MAR=1, as.numeric)

  outmat<- matrix(NA, nrow = length(ts_dates), ncol = length(timecols))
  outmat[ts_dates %in% x$dateoffeeding,] <- out[,]
  as.vector(outmat)
})
ts_matrix <- do.call(rbind, ts_matrix)
colnames(ts_matrix) <- ts_dates_times
rownames(ts_matrix) <- sapply(split(feeding_all, feeding_all$feederid), function(x){x$feederid[1]})



set.seed(123)
random_subsample <-sample(unique(feeding_all_reshaped$feedername), 50)


feeding_all_reshaped[feeding_all_reshaped$feedername %in% random_subsample, ] %>% 
  ggplot(aes(x = dateoffeeding_lub, y = feedername)) +
  geom_tile(aes(fill = factor(power) )) +
  theme_cowplot()+
  geom_vline(xintercept = as_date("2020-08-04"), color = "red")

Summary view of trend:

summ <- feeding_all_reshaped %>% group_by(dateoffeeding, feederid) %>% summarise(hoursOn = sum(power == 1), hoursOff = sum(power == 0))
ggplot(summ, aes(x = dateoffeeding, y = hoursOn)) + 
  # geom_line(aes(group = feederid), alpha = 0.3) +
  geom_smooth(fill = "white") +
  ylim(0,24) +
  geom_vline(xintercept = as_date("2020-08-04"), color = "red") +
  theme_cowplot() +
  ylab("Hours of power per day")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# dtw_dist <- dtw::dtwDist(ts_matrix)

Hown many outlets are on vs off (per substation) at any given time? General view (smooth fit)

summ_substation <- feeding_all_reshaped %>% group_by(dateoffeeding_lub, substationid) %>% summarise(ratio_outlets_on = sum(power == 1)/length(power))

ggplot(summ_substation, aes(x = dateoffeeding_lub, y = ratio_outlets_on)) + 
  # geom_line(aes(group = feederid), alpha = 0.3) +
  geom_smooth(fill = "white") +
  ylim(0,1) +
  geom_vline(xintercept = as_date("2020-08-04"), color = "red") +
  theme_cowplot() +
  ylab("Ratio of outlets online") +
  xlab("Date")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Distribution in bins of 3 months.

# set.seed(123)
# random_subsample_substation <-sample(unique(feeding_all_reshaped$substationid), 5)


summ_substation <- summ_substation %>% group_by(year(dateoffeeding_lub)) %>% mutate(interval = cut_interval(month(summ_substation$dateoffeeding_lub), length = 3))
yrs <- unique(year(summ_substation$dateoffeeding_lub))
ints <- unique(summ_substation$interval)
combs <- apply(expand.grid(yrs,ints), MAR = 1, paste, collapse = " - ")


summ_substation$interval <- factor(paste0(summ_substation$`year(dateoffeeding_lub)`, " - ", summ_substation$interval ))
summ_substation$interval <- factor(summ_substation$interval, levels = combs, ordered = T)

# unique_starts <- unique(summ_substation$interval_start)
# interval_start_label <- setNames(paste0(month(unique_starts), "-", year(unique_starts)),
#                                  as.character(unique_starts))
#                                  
ggplot(summ_substation, aes(x = interval, y = ratio_outlets_on)) + 
  # geom_line(aes(group = substationid), alpha = 0.3) +
  # geom_smooth(fill = "white") +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  ylim(0,1) +
  geom_vline(xintercept = as_date("2020-08-04"), color = "red") +
  theme_cowplot() +
  # scale_x_discrete(labels = interval_start_label) +
  ylab("Ratio of outlets online (at any given time per substation)") +
  xlab("Period (year-[month_start,month_end])")

R Session Information

sessionInfo()
## R version 4.0.0 (2020-04-24)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=C             
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] dtw_1.21-3       proxy_0.4-24     lubridate_1.7.8  leaflet_2.0.3   
##  [5] visNetwork_2.0.9 igraph_1.2.5     cowplot_1.1.0    forcats_0.5.0   
##  [9] stringr_1.4.0    dplyr_0.8.5      purrr_0.3.4      readr_1.3.1     
## [13] tidyr_1.0.3      tibble_3.0.1     ggplot2_3.3.0    tidyverse_1.3.0 
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.4.6      lattice_0.20-41   assertthat_0.2.1  digest_0.6.25    
##  [5] R6_2.4.1          cellranger_1.1.0  plyr_1.8.6        backports_1.1.7  
##  [9] reprex_0.3.0      evaluate_0.14     httr_1.4.1        pillar_1.4.4     
## [13] rlang_0.4.6       readxl_1.3.1      rstudioapi_0.11   Matrix_1.2-18    
## [17] rmarkdown_2.1     labeling_0.3      splines_4.0.0     htmlwidgets_1.5.1
## [21] munsell_0.5.0     broom_0.5.6       compiler_4.0.0    modelr_0.1.7     
## [25] xfun_0.13         pkgconfig_2.0.3   mgcv_1.8-31       htmltools_0.4.0  
## [29] tidyselect_1.1.0  fansi_0.4.1       crayon_1.3.4      dbplyr_1.4.3     
## [33] withr_2.2.0       grid_4.0.0        nlme_3.1-147      jsonlite_1.6.1   
## [37] gtable_0.3.0      lifecycle_0.2.0   DBI_1.1.0         magrittr_1.5     
## [41] scales_1.1.1      cli_2.0.2         stringi_1.4.6     farver_2.0.3     
## [45] fs_1.4.1          xml2_1.3.2        ellipsis_0.3.1    generics_0.0.2   
## [49] vctrs_0.3.0       tools_4.0.0       glue_1.4.1        hms_0.5.3        
## [53] crosstalk_1.1.0.1 yaml_2.2.1        colorspace_1.4-1  rvest_0.3.5      
## [57] knitr_1.28        haven_2.2.0