diff --git a/R/edl_feeding_crawler.R b/R/edl_feeding_crawler.R index b2c327c5b419be6fe5811606f008ac79200910f9..c3854097458912e12a5bf5b42c1685251f0cdf70 100644 --- a/R/edl_feeding_crawler.R +++ b/R/edl_feeding_crawler.R @@ -1,17 +1,47 @@ +#' Usage: +#' Initial run +#' Rscript ./R/edl_feeding_crawler.R +#' To update data +#' Rscript ./R/edl_feeding_crawler.R update + library(httr) library(xml2) library(dplyr) library(stringi) library(reshape) library(reshape2) -# library(parallel) +library(parallel) +library(pbapply) +pboptions(type = "txt") +ncores <- ifelse(detectCores() >= 6, 6, detectCores()) + +args <- commandArgs(trailingOnly = TRUE) + +if(length(args) == 0){ + update = FALSE +}else{ + update = ifelse(args[1] == "update", TRUE, FALSE) +} + +if(update){ + load("./data/edl_feeding_crawler_output.RData", verbose = T) + feeding_all_old <- feeding_all + outlet_meta_old <- outlet_meta + stations_dt_old <- stations_dt + date1 <- max(lubridate::as_date(feeding_all_old$dateoffeeding) + 1) +}else{ + date1 <- "2008-10-20" +} + +date2 <- as.character(lubridate::today()) + # get station names stations <- read_xml("./data/substations.xml") # manually derived by Maroun Bou Sleiman stations_dt <- data.frame(station_name = xml2::xml_text(xml2::xml_children(stations)), station_id = unlist(xml2::xml_attrs(xml2::xml_children(stations))), stringsAsFactors = F) # get outlets and associated metadata of stations -outlet_meta <- lapply(stations_dt$station_id, function(x){ +outlet_meta <- pblapply(X=stations_dt$station_id, FUN = function(x){ print(x) bd <- list(mode = "feeders-sel2", id = x) res <- POST("http://www.edl.gov.lb/feedingdata.php", body = bd, verbose()) @@ -21,45 +51,45 @@ outlet_meta <- lapply(stations_dt$station_id, function(x){ Encoding(substations_metadata$text) <- "UTF-8" substations_metadata$station_id <- x substations_metadata -} ) +}, cl = ncores ) outlet_meta <- do.call(rbind, outlet_meta) -head(outlet_meta) -# get the supply data -date1 <- "2008-10-20" -date2 <- 
as.character(lubridate::today()) - - -feeding_all <- lapply(unique(unique(outlet_meta$id)),function(x){ +feeding_all <-pblapply(X=unique(outlet_meta$id), FUN = function(x){ print(x) bd2 <- list(mode = "load", actpre = "act", a_feeders = x, d1 = date1, d2=date2) res2 <- POST("http://www.edl.gov.lb/feedingdata.php", body = bd2) - results <- content(res2, as = "parsed") results <- jsonlite::fromJSON(xml_text(xml2::xml_child(results,1), trim = T))$list_feeders results -}) +}, cl = ncores) feeding_all <- do.call(rbind, feeding_all) -timecols <- paste0("time", 0:23) -feeding_all_reshaped <- reshape(data = feeding_all, idvar = "id", varying = list(which(colnames(feeding_all) %in% timecols)), direction = "long", v.names = "power") -library(ggplot2) -library(lubridate) -feeding_all_reshaped$dateoffeeding_lub <- as_datetime(ymd(feeding_all_reshaped$dateoffeeding)) -hour(feeding_all_reshaped$dateoffeeding_lub) <- feeding_all_reshaped$time +if(update){ + feeding_all <- rbind(feeding_all_old, feeding_all) +} -table(feeding_all_reshaped$power) -table(feeding_all_reshaped$subgroup) -table(feeding_all_reshaped$substationid) -save(feeding_all_reshaped,feeding_all, outlet_meta, stations_dt, file = "./data/edl_feeding_crawler_output.RData") +save(feeding_all, outlet_meta, stations_dt, file = "./data/edl_feeding_crawler_output.RData") #alternatively, save individual files in csv write.csv(feeding_all, file = "./data/feeding_all.csv") -write.csv(feeding_all_reshaped, file = "./data/feeding_all_reshaped.csv") write.csv(outlet_meta, file = "./data/outlet_meta.csv") write.csv(stations_dt, file = "./data/stations_dt.csv") + + + +# library(ggplot2) +# library(lubridate) +# timecols <- paste0("time", 0:23) +# feeding_all_reshaped <- reshape(data = feeding_all, idvar = "id", varying = list(which(colnames(feeding_all) %in% timecols)), direction = "long", v.names = "power") +# feeding_all_reshaped$dateoffeeding_lub <- as_datetime(ymd(feeding_all_reshaped$dateoffeeding)) +# 
hour(feeding_all_reshaped$dateoffeeding_lub) <- feeding_all_reshaped$time +# table(feeding_all_reshaped$power) +# table(feeding_all_reshaped$subgroup) +# table(feeding_all_reshaped$substationid) + + # p <- ggplot(feeding_all_reshaped, aes(x = dateoffeeding_lub, y = feedername, fill = power)) + # geom_tile() + scale_fill_manual(values = c("0" = "red", "1" = "green")) + # facet_wrap(substationname~., drop= T, ncol = 2) diff --git a/R/edl_feeding_postprocess.R b/R/edl_feeding_postprocess.R index 022fd24836537b8b115d945f0ce085f7684a2f86..2510c8ca697c65715e2d3f1f0a1fe237295e600e 100644 --- a/R/edl_feeding_postprocess.R +++ b/R/edl_feeding_postprocess.R @@ -23,9 +23,8 @@ outlet_meta$longitude <- outlet_geocodes$longitude -save(feeding_all_reshaped,feeding_all, outlet_meta, stations_dt, file = "./data/edl_feeding_crawler_output_postprocessed.RData") +save(feeding_all, outlet_meta, stations_dt, file = "./data/edl_feeding_crawler_output_postprocessed.RData") #alternatively, save individual files in csv write.csv(feeding_all, file = "./data/feeding_all_postprocessed.csv") -write.csv(feeding_all_reshaped, file = "./data/feeding_all_reshaped_postprocessed.csv") write.csv(outlet_meta, file = "./data/outlet_meta_postprocessed.csv") write.csv(stations_dt, file = "./data/stations_dt_postprocessed.csv") \ No newline at end of file diff --git a/README.md b/README.md index 2286407d4c0dee54335959c12d746139f7603441..894d80a506510509558f32a19f7f4cdaa4c88e20 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ This project uses openly available data to assess this sector. The feeding data Basic R 4.0 template. See the `Dockerfile` and `install.R` files for modifications from the base template. ## Steps -1. First step is to crawl the http://www.edl.gov.lb/?=&lang=en website, specifically the daily supply section (http://www.edl.gov.lb/feeding.php). This is performed using the `./R/edl_feeding_crawler.R` followed by `./R/edl_feeding_postprocess.R`. It's a time consuming step. 
It is best to run from terminal as follows if you want to track inputs and outputs: +1. First step is to crawl the http://www.edl.gov.lb/?=&lang=en website, specifically the daily supply section (http://www.edl.gov.lb/feeding.php). This is performed using the `./R/edl_feeding_crawler.R` followed by `./R/edl_feeding_postprocess.R`. It's a time consuming step. It is best to run from terminal as follows if you want to track inputs and outputs. ``` ## without Renku @@ -18,6 +18,11 @@ Rscript ./R/edl_feeding_postprocess.R ## with Renku renku run Rscript ./R/edl_feeding_crawler.R renku run Rscript ./R/edl_feeding_postprocess.R + +## If you append `update` as an argument, the script will first read available data and complete it with more recent data +Rscript ./R/edl_feeding_crawler.R update +Rscript ./R/edl_feeding_postprocess.R update + ``` By default the script will fetch the data from 20 Oct 2008 to "today". You'll need to modify the script to change the dates. Earlies date (I think) is 20 Oct 2008. 
diff --git a/data/feeding_all_reshaped.csv b/data/feeding_all_reshaped.csv deleted file mode 100644 index 9b28930d2c468c1ebd367bc2c9f9ff36deaba7c2..0000000000000000000000000000000000000000 --- a/data/feeding_all_reshaped.csv +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8a226c8b46d8e2a988baf50ae24cf81083c1673e2f92fa044820c7d1a44eee2e -size 524388997 diff --git a/data/feeding_all_reshaped_postprocessed.csv b/data/feeding_all_reshaped_postprocessed.csv deleted file mode 100644 index 9b28930d2c468c1ebd367bc2c9f9ff36deaba7c2..0000000000000000000000000000000000000000 --- a/data/feeding_all_reshaped_postprocessed.csv +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8a226c8b46d8e2a988baf50ae24cf81083c1673e2f92fa044820c7d1a44eee2e -size 524388997 diff --git a/install.R b/install.R index a40c8c220ad0bde96b3d9fccdc772e9c1a5c0acb..01f057e917fb0497c4dbf47c10a811c6b8a3017e 100644 --- a/install.R +++ b/install.R @@ -1 +1 @@ -install.packages(c("cowplot", "plotly", "geojsonio", "rgdal","leaflet", "reshape", "igraph", "visNetwork", "tidygeocoder", "dtw")) \ No newline at end of file +install.packages(c("cowplot", "plotly", "geojsonio", "rgdal","leaflet", "reshape", "igraph", "visNetwork", "tidygeocoder", "dtw", "pbapply")) \ No newline at end of file