From 1bd2b1ff5616ac45bc888ae911bff6cfde6fa458 Mon Sep 17 00:00:00 2001 From: Maroun Bou Sleiman Date: Mon, 14 Sep 2020 07:43:34 +0000 Subject: [PATCH] added update capability, re-introduced parallelism --- R/edl_feeding_crawler.R | 76 ++++++++++++++------- R/edl_feeding_postprocess.R | 3 +- README.md | 7 +- data/feeding_all_reshaped.csv | 3 - data/feeding_all_reshaped_postprocessed.csv | 3 - install.R | 2 +- 6 files changed, 61 insertions(+), 33 deletions(-) delete mode 100644 data/feeding_all_reshaped.csv delete mode 100644 data/feeding_all_reshaped_postprocessed.csv diff --git a/R/edl_feeding_crawler.R b/R/edl_feeding_crawler.R index b2c327c..c385409 100644 --- a/R/edl_feeding_crawler.R +++ b/R/edl_feeding_crawler.R @@ -1,17 +1,47 @@ +#' Usage: +#' Initial run +#' Rscript ./R/edl_feeding_crawler.R +#' To update data +#' Rscript ./R/edl_feeding_crawler.R update + library(httr) library(xml2) library(dplyr) library(stringi) library(reshape) library(reshape2) -# library(parallel) +library(parallel) +library(pbapply) +pboptions(type = "txt") +ncores <- ifelse(detectCores() >= 6, 6, detectCores()) + +args <- commandArgs(trailingOnly = TRUE) + +if(length(args) == 0){ + update = FALSE +}else{ + update = ifelse(args[1] == "update", TRUE, FALSE) +} + +if(update){ + load("./data/edl_feeding_crawler_output.RData", verbose = T) + feeding_all_old <- feeding_all + outlet_meta_old <- outlet_meta + stations_dt_old <- stations_dt + date1 <- max(lubridate::as_date(feeding_all_old$dateoffeeding) + 1) +}else{ + date1 <- "2008-10-20" +} + +date2 <- as.character(lubridate::today()) + # get station names stations <- read_xml("./data/substations.xml") # manually derived by Maroun Bou Sleiman stations_dt <- data.frame(station_name = xml2::xml_text(xml2::xml_children(stations)), station_id = unlist(xml2::xml_attrs(xml2::xml_children(stations))), stringsAsFactors = F) # get outlets and associated metadata of stations -outlet_meta <- lapply(stations_dt$station_id, function(x){ +outlet_meta <- 
pblapply(X=stations_dt$station_id, FUN = function(x){ print(x) bd <- list(mode = "feeders-sel2", id = x) res <- POST("http://www.edl.gov.lb/feedingdata.php", body = bd, verbose()) @@ -21,45 +51,45 @@ outlet_meta <- lapply(stations_dt$station_id, function(x){ Encoding(substations_metadata$text) <- "UTF-8" substations_metadata$station_id <- x substations_metadata -} ) +}, cl = ncores ) outlet_meta <- do.call(rbind, outlet_meta) -head(outlet_meta) -# get the supply data -date1 <- "2008-10-20" -date2 <- as.character(lubridate::today()) - - -feeding_all <- lapply(unique(unique(outlet_meta$id)),function(x){ +feeding_all <-pblapply(X=unique(outlet_meta$id), FUN = function(x){ print(x) bd2 <- list(mode = "load", actpre = "act", a_feeders = x, d1 = date1, d2=date2) res2 <- POST("http://www.edl.gov.lb/feedingdata.php", body = bd2) - results <- content(res2, as = "parsed") results <- jsonlite::fromJSON(xml_text(xml2::xml_child(results,1), trim = T))$list_feeders results -}) +}, cl = ncores) feeding_all <- do.call(rbind, feeding_all) -timecols <- paste0("time", 0:23) -feeding_all_reshaped <- reshape(data = feeding_all, idvar = "id", varying = list(which(colnames(feeding_all) %in% timecols)), direction = "long", v.names = "power") -library(ggplot2) -library(lubridate) -feeding_all_reshaped$dateoffeeding_lub <- as_datetime(ymd(feeding_all_reshaped$dateoffeeding)) -hour(feeding_all_reshaped$dateoffeeding_lub) <- feeding_all_reshaped$time +if(update){ + feeding_all <- rbind(feeding_all_old, feeding_all) +} -table(feeding_all_reshaped$power) -table(feeding_all_reshaped$subgroup) -table(feeding_all_reshaped$substationid) -save(feeding_all_reshaped,feeding_all, outlet_meta, stations_dt, file = "./data/edl_feeding_crawler_output.RData") +save(feeding_all, outlet_meta, stations_dt, file = "./data/edl_feeding_crawler_output.RData") #alternatively, save individual files in csv write.csv(feeding_all, file = "./data/feeding_all.csv") -write.csv(feeding_all_reshaped, file = 
"./data/feeding_all_reshaped.csv") write.csv(outlet_meta, file = "./data/outlet_meta.csv") write.csv(stations_dt, file = "./data/stations_dt.csv") + + + +# library(ggplot2) +# library(lubridate) +# timecols <- paste0("time", 0:23) +# feeding_all_reshaped <- reshape(data = feeding_all, idvar = "id", varying = list(which(colnames(feeding_all) %in% timecols)), direction = "long", v.names = "power") +# feeding_all_reshaped$dateoffeeding_lub <- as_datetime(ymd(feeding_all_reshaped$dateoffeeding)) +# hour(feeding_all_reshaped$dateoffeeding_lub) <- feeding_all_reshaped$time +# table(feeding_all_reshaped$power) +# table(feeding_all_reshaped$subgroup) +# table(feeding_all_reshaped$substationid) + + # p <- ggplot(feeding_all_reshaped, aes(x = dateoffeeding_lub, y = feedername, fill = power)) + # geom_tile() + scale_fill_manual(values = c("0" = "red", "1" = "green")) + # facet_wrap(substationname~., drop= T, ncol = 2) diff --git a/R/edl_feeding_postprocess.R b/R/edl_feeding_postprocess.R index 022fd24..2510c8c 100644 --- a/R/edl_feeding_postprocess.R +++ b/R/edl_feeding_postprocess.R @@ -23,9 +23,8 @@ outlet_meta$longitude <- outlet_geocodes$longitude -save(feeding_all_reshaped,feeding_all, outlet_meta, stations_dt, file = "./data/edl_feeding_crawler_output_postprocessed.RData") +save(feeding_all, outlet_meta, stations_dt, file = "./data/edl_feeding_crawler_output_postprocessed.RData") #alternatively, save individual files in csv write.csv(feeding_all, file = "./data/feeding_all_postprocessed.csv") -write.csv(feeding_all_reshaped, file = "./data/feeding_all_reshaped_postprocessed.csv") write.csv(outlet_meta, file = "./data/outlet_meta_postprocessed.csv") write.csv(stations_dt, file = "./data/stations_dt_postprocessed.csv") \ No newline at end of file diff --git a/README.md b/README.md index 2286407..894d80a 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ This project uses openly available data to assess this sector. The feeding data Basic R 4.0 template. 
See the `Dockerfile` and `install.R` files for modifications from the base template. ## Steps -1. First step is to crawl the http://www.edl.gov.lb/?=&lang=en website, specifically the daily supply section (http://www.edl.gov.lb/feeding.php). This is performed using the `./R/edl_feeding_crawler.R` followed by `./R/edl_feeding_postprocess.R`. It's a time consuming step. It is best to run from terminal as follows if you want to track inputs and outputs: +1. First step is to crawl the http://www.edl.gov.lb/?=&lang=en website, specifically the daily supply section (http://www.edl.gov.lb/feeding.php). This is performed using the `./R/edl_feeding_crawler.R` followed by `./R/edl_feeding_postprocess.R`. It's a time consuming step. It is best to run from terminal as follows if you want to track inputs and outputs. ``` ## without Renku @@ -18,6 +18,11 @@ Rscript ./R/edl_feeding_postprocess.R ## with Renku renku run Rscript ./R/edl_feeding_crawler.R renku run Rscript ./R/edl_feeding_postprocess.R + +## If you append `update` as an argument, the script will first read available data and complete it with more recent data +Rscript ./R/edl_feeding_crawler.R update +Rscript ./R/edl_feeding_postprocess.R update + ``` By default the script will fetch the data from 20 Oct 2008 to "today". You'll need to modify the script to change the dates. Earlies date (I think) is 20 Oct 2008. 
diff --git a/data/feeding_all_reshaped.csv b/data/feeding_all_reshaped.csv deleted file mode 100644 index 9b28930..0000000 --- a/data/feeding_all_reshaped.csv +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8a226c8b46d8e2a988baf50ae24cf81083c1673e2f92fa044820c7d1a44eee2e -size 524388997 diff --git a/data/feeding_all_reshaped_postprocessed.csv b/data/feeding_all_reshaped_postprocessed.csv deleted file mode 100644 index 9b28930..0000000 --- a/data/feeding_all_reshaped_postprocessed.csv +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8a226c8b46d8e2a988baf50ae24cf81083c1673e2f92fa044820c7d1a44eee2e -size 524388997 diff --git a/install.R b/install.R index a40c8c2..01f057e 100644 --- a/install.R +++ b/install.R @@ -1 +1 @@ -install.packages(c("cowplot", "plotly", "geojsonio", "rgdal","leaflet", "reshape", "igraph", "visNetwork", "tidygeocoder", "dtw")) \ No newline at end of file +install.packages(c("cowplot", "plotly", "geojsonio", "rgdal","leaflet", "reshape", "igraph", "visNetwork", "tidygeocoder", "dtw", "pbapply")) \ No newline at end of file -- GitLab