by Maroun Bou Sleiman
This R Markdown notebook explores electrical feeding data from EDL.
This notebook was run on Renku. The project is using the “R basic template 4.0” with these differences. Extra debian packages installed (see Dockerfile
), extra R packages(see install.R
)
Data from the EDL website was fetched using a custom web crawler and using this command renku run ./R/edl_feeding_crawler.R
. See README.rmd
for more information.
Then there is a post-processing step renku run ./R/edl_feeding_postprocess.R
. Here, some minor modifications are done to the data and a feeble attempt to geolocate these stations based on a simple search using the tidygeocoder
package.
First load some R packages and the data
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(cowplot)) # for arranging and combining plots
suppressPackageStartupMessages(library(igraph))
suppressPackageStartupMessages(library(visNetwork))
suppressPackageStartupMessages(library(leaflet))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(lubridate))
suppressPackageStartupMessages(library(tidyr))
load("../data/edl_feeding_crawler_output_postprocessed.RData", verbose = T)
## Loading objects:
## feeding_all_reshaped
## feeding_all
## outlet_meta
## stations_dt
Some stats: we have 85 main stations, 1008 outlets, and 143305 data points from 166 days. Let’s check the metadata first: the stations
head(stations_dt)
Let’s check the metadata first: the substations/outlets
head(outlet_meta)
And here’s how the data looks like. 0 = no electricity, 1 = electricity
head(feeding_all)
For convenience, one can reshape the dataset to a long format (larger in size):
timecols <- paste0("time", 0:23)
feeding_all_reshaped <- feeding_all %>% gather(time, power, paste0("time", 0:23) )
feeding_all_reshaped$time <- as.numeric(gsub("time", "",feeding_all_reshaped$time))
feeding_all_reshaped$dateoffeeding_lub <- as_datetime(ymd(feeding_all_reshaped$dateoffeeding))
hour(feeding_all_reshaped$dateoffeeding_lub) <- feeding_all_reshaped$time
table(feeding_all_reshaped$power)
##
## 0 1
## 1483294 1956026
table(feeding_all_reshaped$subgroup)
##
## A1 A2 B1 B2
## 858744 871920 893880 814776
table(feeding_all_reshaped$substationid)
##
## 1 10 11 12 13 14 15 16 17 18 19
## 76632 98208 74280 69552 60672 70416 75480 31488 15552 28248 95760
## 2 20 21 22 23 24 25 26 27 28 29
## 68040 19680 51816 26376 34560 44928 70416 42048 38952 26760 84480
## 3 30 31 32 33 34 35 36 37 38 39
## 84888 95088 23472 6864 27552 15456 23616 35208 35208 65184 21744
## 4 40 41 42 43 44 45 46 47 48 49
## 53856 23760 11664 27984 20352 18816 66120 16848 11808 11808 51048
## 5 50 51 52 53 55 56 57 58 59 6
## 66096 47160 31296 54864 35304 3936 27456 3936 7824 15648 85536
## 60 61 62 64 65 66 67 68 69 7 70
## 7848 7824 11760 11784 15696 7824 11736 11736 34560 150696 69552
## 71 72 73 74 75 76 77 78 79 8 80
## 30720 34560 43296 20016 3936 3936 47232 61272 43296 86400 11808
## 81 82 83 84 85 86 87 9
## 27552 16992 115392 52560 15744 38880 46944 66024
Get powerplant data
power_plants <- readxl::read_xls("../data/edl_power_plant_2020.xls", skip = 2)
coords <- do.call(rbind, strsplit(gsub("( )+|( )+", "", power_plants$gps_coordinates), ","))
colnames(coords) <- c("latitude", "longitude")
coords <- apply(coords, MAR = c(1,2), as.numeric)
power_plants <- cbind(power_plants, coords)
Construct graph
graph_df <- data.frame(from = outlet_meta$global_station_id,
to = outlet_meta$global_id,
station_id = outlet_meta$station_id,
outlet_id = outlet_meta$id)
graph_df$from_lat <- stations_dt$latitude[match(outlet_meta$global_station_id, stations_dt$global_id)]
graph_df$from_lng <- stations_dt$longitude[match(outlet_meta$global_station_id, stations_dt$global_id)]
graph_df$to_lat <- outlet_meta$latitude
graph_df$to_lng <- outlet_meta$longitude
vertex_meta <- data.frame(name = unique(c(graph_df$from, graph_df$to)))
vertex_meta$type <- gsub("_(.)+", "", vertex_meta$name)
vertex_meta$internal_id <- gsub("(.)+_", "", vertex_meta$name)
vertex_meta <- split(vertex_meta, vertex_meta$type)
vertex_meta <- lapply(vertex_meta, function(x){
if(x$type[1] == "station"){
out <- x
out$label <- stations_dt$station_name[match(x$name, stations_dt$global_id)]
out
}else{
out <- x
out$label <- outlet_meta$text[match(x$name, outlet_meta$global_id)]
out
}
})
vertex_meta <- do.call(rbind, vertex_meta)
vertex_meta$shape <- plyr::mapvalues(vertex_meta$type, c("outlet", "station"), c("circle", "square"))
g <- igraph::graph_from_data_frame(graph_df, vertices = vertex_meta)
# visNetwork::visIgraph(g) %>% visIgraphLayout(layout = "layout_nicely") %>%
# visNodes(size = 10) %>%
# visOptions(highlightNearest = list(enabled = T, hover = T),
# nodesIdSelection = T)
lf <- leaflet(stations_dt) %>%
addTiles() %>%
leaflet::addAwesomeMarkers(lng = stations_dt$longitude,
lat = stations_dt$latitude,
label = stations_dt$station_name,
icon = awesomeIcons(icon = "bolt", library = "fa")) %>%
leaflet::addCircles(lng = outlet_meta$longitude, lat = outlet_meta$latitude, label = outlet_meta$text) %>%
leaflet::addAwesomeMarkers(lng = power_plants$longitude,
lat = power_plants$latitude,
label = power_plants$power_plant_group,
icon = awesomeIcons(icon = ifelse(power_plants$type=="Thermal", "industry", "tint"),
library = rep("fa", nrow(power_plants)),
iconColor = rep("black", nrow(power_plants)),
markerColor = rep("orange", nrow(power_plants)))) %>%
fitBounds(35, 33, 37, 35)
## Warning in validateCoords(lng, lat, funcName): Data contains 30 rows with either
## missing or invalid lat/lon values and will be ignored
## Warning in validateCoords(lng, lat, funcName): Data contains 979 rows with
## either missing or invalid lat/lon values and will be ignored
# which_known_path <- which(rowSums(!is.na(graph_df)[,c("from_lat", "from_lng", "to_lat", "to_lng")]) == 4)
# for(i in which_known_path){
# lf <- lf %>% leaflet::addPolylines(lng = c(graph_df$from_lng[i], graph_df$to_lng[i]), lat = c(graph_df$from_lat[i], graph_df$to_lat[i]))
# }
lf
suppressPackageStartupMessages(library(dtw))
timecols <- paste0("time", 0:23)
# make a timeseries from the first day to the last day
first_day <- lubridate::as_date(min(feeding_all$dateoffeeding))
last_day <- lubridate::as_date(max(feeding_all$dateoffeeding))
ndays <- last_day - first_day
ts_dates <- first_day + 0:ndays
ts_dates_times <- rep(ts_dates, each = 24)
ts_dates_times <- paste0(ts_dates_times, "_", timecols)
feeding_all$dateoffeeding <- lubridate::as_date(feeding_all$dateoffeeding)
feeding_all_reshaped$dateoffeeding <- lubridate::as_date(feeding_all_reshaped$dateoffeeding)
feeding_all_reshaped$power <- as.numeric(feeding_all_reshaped$power)
ts_matrix <- lapply(split(feeding_all, feeding_all$feederid), function(x){
x <- x[order(x$dateoffeeding), ]
out <- apply(x[,timecols], MAR=1, as.numeric)
outmat<- matrix(NA, nrow = length(ts_dates), ncol = length(timecols))
outmat[ts_dates %in% x$dateoffeeding,] <- out[,]
as.vector(outmat)
})
ts_matrix <- do.call(rbind, ts_matrix)
colnames(ts_matrix) <- ts_dates_times
rownames(ts_matrix) <- sapply(split(feeding_all, feeding_all$feederid), function(x){x$feederid[1]})
set.seed(123)
random_subsample <-sample(unique(feeding_all_reshaped$feedername), 50)
feeding_all_reshaped[feeding_all_reshaped$feedername %in% random_subsample, ] %>%
ggplot(aes(x = dateoffeeding_lub, y = feedername)) +
geom_tile(aes(fill = factor(power) )) +
theme_cowplot()+
geom_vline(xintercept = as_date("2020-08-04"), color = "red")
Summary view of trend:
summ <- feeding_all_reshaped %>% group_by(dateoffeeding, feederid) %>% summarise(hoursOn = sum(power == 1), hoursOff = sum(power == 0))
ggplot(summ, aes(x = dateoffeeding, y = hoursOn)) +
# geom_line(aes(group = feederid), alpha = 0.3) +
geom_smooth(fill = "white") +
ylim(0,24) +
geom_vline(xintercept = as_date("2020-08-04"), color = "red") +
theme_cowplot() +
ylab("Hours of power per day")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# dtw_dist <- dtw::dtwDist(ts_matrix)
Hown many outlets are on vs off (per substation) at any given time? General view (smooth fit)
summ_substation <- feeding_all_reshaped %>% group_by(dateoffeeding_lub, substationid) %>% summarise(ratio_outlets_on = sum(power == 1)/length(power))
ggplot(summ_substation, aes(x = dateoffeeding_lub, y = ratio_outlets_on)) +
# geom_line(aes(group = feederid), alpha = 0.3) +
geom_smooth(fill = "white") +
ylim(0,1) +
geom_vline(xintercept = as_date("2020-08-04"), color = "red") +
theme_cowplot() +
ylab("Ratio of outlets online") +
xlab("Date")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Distribution in bins of 3 months.
# set.seed(123)
# random_subsample_substation <-sample(unique(feeding_all_reshaped$substationid), 5)
summ_substation <- summ_substation %>% group_by(year(dateoffeeding_lub)) %>% mutate(interval = cut_interval(month(summ_substation$dateoffeeding_lub), length = 3))
yrs <- unique(year(summ_substation$dateoffeeding_lub))
ints <- unique(summ_substation$interval)
combs <- apply(expand.grid(yrs,ints), MAR = 1, paste, collapse = " - ")
summ_substation$interval <- factor(paste0(summ_substation$`year(dateoffeeding_lub)`, " - ", summ_substation$interval ))
summ_substation$interval <- factor(summ_substation$interval, levels = combs, ordered = T)
# unique_starts <- unique(summ_substation$interval_start)
# interval_start_label <- setNames(paste0(month(unique_starts), "-", year(unique_starts)),
# as.character(unique_starts))
#
ggplot(summ_substation, aes(x = interval, y = ratio_outlets_on)) +
# geom_line(aes(group = substationid), alpha = 0.3) +
# geom_smooth(fill = "white") +
geom_violin() +
geom_boxplot(width = 0.1) +
ylim(0,1) +
geom_vline(xintercept = as_date("2020-08-04"), color = "red") +
theme_cowplot() +
# scale_x_discrete(labels = interval_start_label) +
ylab("Ratio of outlets online (at any given time per substation)") +
xlab("Period (year-[month_start,month_end])")
sessionInfo()
## R version 4.0.0 (2020-04-24)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] dtw_1.21-3 proxy_0.4-24 lubridate_1.7.8 leaflet_2.0.3
## [5] visNetwork_2.0.9 igraph_1.2.5 cowplot_1.1.0 forcats_0.5.0
## [9] stringr_1.4.0 dplyr_0.8.5 purrr_0.3.4 readr_1.3.1
## [13] tidyr_1.0.3 tibble_3.0.1 ggplot2_3.3.0 tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.4.6 lattice_0.20-41 assertthat_0.2.1 digest_0.6.25
## [5] R6_2.4.1 cellranger_1.1.0 plyr_1.8.6 backports_1.1.7
## [9] reprex_0.3.0 evaluate_0.14 httr_1.4.1 pillar_1.4.4
## [13] rlang_0.4.6 readxl_1.3.1 rstudioapi_0.11 Matrix_1.2-18
## [17] rmarkdown_2.1 labeling_0.3 splines_4.0.0 htmlwidgets_1.5.1
## [21] munsell_0.5.0 broom_0.5.6 compiler_4.0.0 modelr_0.1.7
## [25] xfun_0.13 pkgconfig_2.0.3 mgcv_1.8-31 htmltools_0.4.0
## [29] tidyselect_1.1.0 fansi_0.4.1 crayon_1.3.4 dbplyr_1.4.3
## [33] withr_2.2.0 grid_4.0.0 nlme_3.1-147 jsonlite_1.6.1
## [37] gtable_0.3.0 lifecycle_0.2.0 DBI_1.1.0 magrittr_1.5
## [41] scales_1.1.1 cli_2.0.2 stringi_1.4.6 farver_2.0.3
## [45] fs_1.4.1 xml2_1.3.2 ellipsis_0.3.1 generics_0.0.2
## [49] vctrs_0.3.0 tools_4.0.0 glue_1.4.1 hms_0.5.3
## [53] crosstalk_1.1.0.1 yaml_2.2.1 colorspace_1.4-1 rvest_0.3.5
## [57] knitr_1.28 haven_2.2.0