From 1bd2b1ff5616ac45bc888ae911bff6cfde6fa458 Mon Sep 17 00:00:00 2001 From: Maroun Bou Sleiman Date: Mon, 14 Sep 2020 07:43:34 +0000 Subject: [PATCH] added update capability, re-introduced parallelism --- R/edl_feeding_crawler.R | 76 ++++++++++++++------- R/edl_feeding_postprocess.R | 3 +- README.md | 7 +- data/feeding_all_reshaped.csv | 3 - data/feeding_all_reshaped_postprocessed.csv | 3 - install.R | 2 +- 6 files changed, 61 insertions(+), 33 deletions(-) delete mode 100644 data/feeding_all_reshaped.csv delete mode 100644 data/feeding_all_reshaped_postprocessed.csv diff --git a/R/edl_feeding_crawler.R b/R/edl_feeding_crawler.R index b2c327c..c385409 100644 --- a/R/edl_feeding_crawler.R +++ b/R/edl_feeding_crawler.R @@ -1,17 +1,47 @@ +#' Usage: +#' Initial run +#' Rscript ./R/edl_feeding_crawler.R +#' To update data +#' Rscript ./R/edl_feeding_crawler.R update + library(httr) library(xml2) library(dplyr) library(stringi) library(reshape) library(reshape2) -# library(parallel) +library(parallel) +library(pbapply) +pboptions(type = "txt") +ncores <- ifelse(detectCores() >= 6, 6, detectCores()) + +args <- commandArgs(trailingOnly = TRUE) + +if(length(args) == 0){ + update = FALSE +}else{ + update = ifelse(args[1] == "update", TRUE, FALSE) +} + +if(update){ + load("./data/edl_feeding_crawler_output.RData", verbose = T) + feeding_all_old <- feeding_all + outlet_meta_old <- outlet_meta + stations_dt_old <- stations_dt + date1 <- max(lubridate::as_date(feeding_all_old$dateoffeeding) + 1) +}else{ + date1 <- "2008-10-20" +} + +date2 <- as.character(lubridate::today()) + # get station names stations <- read_xml("./data/substations.xml") # manually derived by Maroun Bou Sleiman stations_dt <- data.frame(station_name = xml2::xml_text(xml2::xml_children(stations)), station_id = unlist(xml2::xml_attrs(xml2::xml_children(stations))), stringsAsFactors = F) # get outlets and associated metadata of stations -outlet_meta <- lapply(stations_dt$station_id, function(x){ +outlet_meta <- 
pblapply(X=stations_dt$station_id, FUN = function(x){ print(x) bd <- list(mode = "feeders-sel2", id = x) res <- POST("http://www.edl.gov.lb/feedingdata.php", body = bd, verbose()) @@ -21,45 +51,45 @@ outlet_meta <- lapply(stations_dt$station_id, function(x){ Encoding(substations_metadata$text) <- "UTF-8" substations_metadata$station_id <- x substations_metadata -} ) +}, cl = ncores ) outlet_meta <- do.call(rbind, outlet_meta) -head(outlet_meta) -# get the supply data -date1 <- "2008-10-20" -date2 <- as.character(lubridate::today()) - - -feeding_all <- lapply(unique(unique(outlet_meta$id)),function(x){ +feeding_all <-pblapply(X=unique(outlet_meta$id), FUN = function(x){ print(x) bd2 <- list(mode = "load", actpre = "act", a_feeders = x, d1 = date1, d2=date2) res2 <- POST("http://www.edl.gov.lb/feedingdata.php", body = bd2) - results <- content(res2, as = "parsed") results <- jsonlite::fromJSON(xml_text(xml2::xml_child(results,1), trim = T))$list_feeders results -}) +}, cl = ncores) feeding_all <- do.call(rbind, feeding_all) -timecols <- paste0("time", 0:23) -feeding_all_reshaped <- reshape(data = feeding_all, idvar = "id", varying = list(which(colnames(feeding_all) %in% timecols)), direction = "long", v.names = "power") -library(ggplot2) -library(lubridate) -feeding_all_reshaped$dateoffeeding_lub <- as_datetime(ymd(feeding_all_reshaped$dateoffeeding)) -hour(feeding_all_reshaped$dateoffeeding_lub) <- feeding_all_reshaped$time +if(update){ + feeding_all <- rbind(feeding_all_old, feeding_all) +} -table(feeding_all_reshaped$power) -table(feeding_all_reshaped$subgroup) -table(feeding_all_reshaped$substationid) -save(feeding_all_reshaped,feeding_all, outlet_meta, stations_dt, file = "./data/edl_feeding_crawler_output.RData") +save(feeding_all, outlet_meta, stations_dt, file = "./data/edl_feeding_crawler_output.RData") #alternatively, save individual files in csv write.csv(feeding_all, file = "./data/feeding_all.csv") -write.csv(feeding_all_reshaped, file = 
"./data/feeding_all_reshaped.csv") write.csv(outlet_meta, file = "./data/outlet_meta.csv") write.csv(stations_dt, file = "./data/stations_dt.csv") + + + +# library(ggplot2) +# library(lubridate) +# timecols <- paste0("time", 0:23) +# feeding_all_reshaped <- reshape(data = feeding_all, idvar = "id", varying = list(which(colnames(feeding_all) %in% timecols)), direction = "long", v.names = "power") +# feeding_all_reshaped$dateoffeeding_lub <- as_datetime(ymd(feeding_all_reshaped$dateoffeeding)) +# hour(feeding_all_reshaped$dateoffeeding_lub) <- feeding_all_reshaped$time +# table(feeding_all_reshaped$power) +# table(feeding_all_reshaped$subgroup) +# table(feeding_all_reshaped$substationid) + + # p <- ggplot(feeding_all_reshaped, aes(x = dateoffeeding_lub, y = feedername, fill = power)) + # geom_tile() + scale_fill_manual(values = c("0" = "red", "1" = "green")) + # facet_wrap(substationname~., drop= T, ncol = 2) diff --git a/R/edl_feeding_postprocess.R b/R/edl_feeding_postprocess.R index 022fd24..2510c8c 100644 --- a/R/edl_feeding_postprocess.R +++ b/R/edl_feeding_postprocess.R @@ -23,9 +23,8 @@ outlet_meta$longitude <- outlet_geocodes$longitude -save(feeding_all_reshaped,feeding_all, outlet_meta, stations_dt, file = "./data/edl_feeding_crawler_output_postprocessed.RData") +save(feeding_all, outlet_meta, stations_dt, file = "./data/edl_feeding_crawler_output_postprocessed.RData") #alternatively, save individual files in csv write.csv(feeding_all, file = "./data/feeding_all_postprocessed.csv") -write.csv(feeding_all_reshaped, file = "./data/feeding_all_reshaped_postprocessed.csv") write.csv(outlet_meta, file = "./data/outlet_meta_postprocessed.csv") write.csv(stations_dt, file = "./data/stations_dt_postprocessed.csv") \ No newline at end of file diff --git a/README.md b/README.md index 2286407..894d80a 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ This project uses openly available data to assess this sector. The feeding data Basic R 4.0 template. 
See the `Dockerfile` and `install.R` files for modifications from the base template. ## Steps -1. First step is to crawl the http://www.edl.gov.lb/?=&lang=en website, specifically the daily supply section (http://www.edl.gov.lb/feeding.php). This is performed using the `./R/edl_feeding_crawler.R` followed by `./R/edl_feeding_postprocess.R`. It's a time consuming step. It is best to run from terminal as follows if you want to track inputs and outputs: +1. First step is to crawl the http://www.edl.gov.lb/?=&lang=en website, specifically the daily supply section (http://www.edl.gov.lb/feeding.php). This is performed using the `./R/edl_feeding_crawler.R` followed by `./R/edl_feeding_postprocess.R`. It's a time consuming step. It is best to run from terminal as follows if you want to track inputs and outputs. ``` ## without Renku @@ -18,6 +18,11 @@ Rscript ./R/edl_feeding_postprocess.R ## with Renku renku run Rscript ./R/edl_feeding_crawler.R renku run Rscript ./R/edl_feeding_postprocess.R + +## If you append `update` as an argument, the script will first read available data and complete it with more recent data +Rscript ./R/edl_feeding_crawler.R update +Rscript ./R/edl_feeding_postprocess.R update + ``` By default the script will fetch the data from 20 Oct 2008 to "today". You'll need to modify the script to change the dates. Earlies date (I think) is 20 Oct 2008. 
diff --git a/data/feeding_all_reshaped.csv b/data/feeding_all_reshaped.csv deleted file mode 100644 index 9b28930..0000000 --- a/data/feeding_all_reshaped.csv +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8a226c8b46d8e2a988baf50ae24cf81083c1673e2f92fa044820c7d1a44eee2e -size 524388997 diff --git a/data/feeding_all_reshaped_postprocessed.csv b/data/feeding_all_reshaped_postprocessed.csv deleted file mode 100644 index 9b28930..0000000 --- a/data/feeding_all_reshaped_postprocessed.csv +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8a226c8b46d8e2a988baf50ae24cf81083c1673e2f92fa044820c7d1a44eee2e -size 524388997 diff --git a/install.R b/install.R index a40c8c2..01f057e 100644 --- a/install.R +++ b/install.R @@ -1 +1 @@ -install.packages(c("cowplot", "plotly", "geojsonio", "rgdal","leaflet", "reshape", "igraph", "visNetwork", "tidygeocoder", "dtw")) \ No newline at end of file +install.packages(c("cowplot", "plotly", "geojsonio", "rgdal","leaflet", "reshape", "igraph", "visNetwork", "tidygeocoder", "dtw", "pbapply")) \ No newline at end of file -- GitLab