Commit 1bd2b1ff authored by Maroun Bou Sleiman

added update capability, re-introduced parallelism

parent 44ed2942
#' Usage:
#' Initial run
#' Rscript ./R/edl_feeding_crawler.R
#' To update data
#' Rscript ./R/edl_feeding_crawler.R update
library(httr)
library(xml2)
library(dplyr)
library(stringi)
library(reshape)
library(reshape2)
# library(parallel)
library(parallel)
library(pbapply)
pboptions(type = "txt")
# Number of workers for the parallel crawl: use up to 6 cores.
# (Fixes a typo: the original fallback called detecCores(), which errored
# whenever fewer than 6 cores were detected.) detectCores() may return NA
# on some platforms, so fall back to 1 in that case.
detected_cores <- detectCores()
ncores <- if (is.na(detected_cores)) 1L else min(detected_cores, 6L)
# Parse CLI arguments: passing "update" as the first argument resumes from
# the previously saved dataset instead of re-crawling everything from 2008.
args <- commandArgs(trailingOnly = TRUE)
update <- length(args) > 0 && args[1] == "update"
if (update) {
  # Resume: reload the last crawl's objects (feeding_all, outlet_meta,
  # stations_dt) and start crawling the day after the newest saved record.
  load("./data/edl_feeding_crawler_output.RData", verbose = TRUE)
  feeding_all_old <- feeding_all
  outlet_meta_old <- outlet_meta
  stations_dt_old <- stations_dt
  date1 <- max(lubridate::as_date(feeding_all_old$dateoffeeding)) + 1
} else {
  # Initial run: earliest date with data on the EDL site.
  date1 <- "2008-10-20"
}
date2 <- as.character(lubridate::today())
# get station names
# The station list is a manually curated XML file: element text = station
# name, element attribute = station id used by the EDL web endpoint.
stations <- read_xml("./data/substations.xml") # manually derived by Maroun Bou Sleiman
stations_dt <- data.frame(station_name = xml2::xml_text(xml2::xml_children(stations)),
station_id = unlist(xml2::xml_attrs(xml2::xml_children(stations))), stringsAsFactors = F)
# get outlets and associated metadata of stations
# NOTE(review): the next two lines are an old/new pair left over from the
# diff view — the serial lapply() was replaced by the parallel pblapply();
# only one of the two should remain in the actual file.
outlet_meta <- lapply(stations_dt$station_id, function(x){
outlet_meta <- pblapply(X=stations_dt$station_id, FUN = function(x){
print(x)
bd <- list(mode = "feeders-sel2", id = x)
res <- POST("http://www.edl.gov.lb/feedingdata.php", body = bd, verbose())
# NOTE(review): diff hunk marker — the middle of this function body is
# collapsed and not visible in this view; do not edit this span blindly.
......@@ -21,45 +51,45 @@ outlet_meta <- lapply(stations_dt$station_id, function(x){
# Mark the scraped text as UTF-8 so R interprets the bytes correctly.
Encoding(substations_metadata$text) <- "UTF-8"
substations_metadata$station_id <- x
substations_metadata
# NOTE(review): paired old/new closers from the diff — the second one
# (with cl = ncores) matches the pblapply() call above.
} )
}, cl = ncores )
# Each list element is a per-station data.frame; stack into one table.
outlet_meta <- do.call(rbind, outlet_meta)
head(outlet_meta)
# get the supply data: one POST per outlet id, fetched in parallel with a
# progress bar. date1/date2 come from the update/initial-run logic above, so
# an "update" run only downloads days newer than the saved dataset.
# (The diff residue re-hardcoding date1 <- "2008-10-20" here is removed: it
# silently clobbered the update-mode start date and defeated update mode.)
feeding_all <- pblapply(X = unique(outlet_meta$id), FUN = function(x){
  print(x)  # progress/trace: current outlet id
  bd2 <- list(mode = "load", actpre = "act", a_feeders = x, d1 = date1, d2 = date2)
  res2 <- POST("http://www.edl.gov.lb/feedingdata.php", body = bd2)
  results <- content(res2, as = "parsed")
  # The endpoint returns JSON wrapped inside an XML node; unwrap, then parse.
  results <- jsonlite::fromJSON(xml_text(xml2::xml_child(results, 1), trim = TRUE))$list_feeders
  results
}, cl = ncores)
# Stack the per-outlet data.frames into one table.
feeding_all <- do.call(rbind, feeding_all)
# In update mode, prepend the previously saved rows BEFORE reshaping so that
# feeding_all_reshaped covers the full history rather than only the newly
# crawled dates. (Previously the merge happened after the reshape, so the
# reshaped table and its CSV silently excluded all historical data.)
if (update) {
  feeding_all <- rbind(feeding_all_old, feeding_all)
}
# Wide -> long: the time0..time23 columns become one "power" value per hour.
timecols <- paste0("time", 0:23)
feeding_all_reshaped <- reshape(data = feeding_all, idvar = "id",
                                varying = list(which(colnames(feeding_all) %in% timecols)),
                                direction = "long", v.names = "power")
library(ggplot2)
library(lubridate)
# Build a proper timestamp: the feeding date at midnight, then set the hour
# from the long-format time index.
feeding_all_reshaped$dateoffeeding_lub <- as_datetime(ymd(feeding_all_reshaped$dateoffeeding))
hour(feeding_all_reshaped$dateoffeeding_lub) <- feeding_all_reshaped$time
# Quick interactive sanity checks on the reshaped data.
table(feeding_all_reshaped$power)
table(feeding_all_reshaped$subgroup)
table(feeding_all_reshaped$substationid)
# Persist outputs. Only feeding_all / outlet_meta / stations_dt go into the
# .RData file — exactly the objects that update mode reloads. (A duplicate
# save() that also bundled feeding_all_reshaped wrote to the same path and
# was immediately overwritten, so it is dropped.)
save(feeding_all, outlet_meta, stations_dt, file = "./data/edl_feeding_crawler_output.RData")
# alternatively, save individual files in csv
write.csv(feeding_all, file = "./data/feeding_all.csv")
write.csv(feeding_all_reshaped, file = "./data/feeding_all_reshaped.csv")
write.csv(outlet_meta, file = "./data/outlet_meta.csv")
write.csv(stations_dt, file = "./data/stations_dt.csv")
# library(ggplot2)
# library(lubridate)
# timecols <- paste0("time", 0:23)
# feeding_all_reshaped <- reshape(data = feeding_all, idvar = "id", varying = list(which(colnames(feeding_all) %in% timecols)), direction = "long", v.names = "power")
# feeding_all_reshaped$dateoffeeding_lub <- as_datetime(ymd(feeding_all_reshaped$dateoffeeding))
# hour(feeding_all_reshaped$dateoffeeding_lub) <- feeding_all_reshaped$time
# table(feeding_all_reshaped$power)
# table(feeding_all_reshaped$subgroup)
# table(feeding_all_reshaped$substationid)
# p <- ggplot(feeding_all_reshaped, aes(x = dateoffeeding_lub, y = feedername, fill = power)) +
# geom_tile() + scale_fill_manual(values = c("0" = "red", "1" = "green")) +
# facet_wrap(substationname~., drop= T, ncol = 2)
......
......@@ -23,9 +23,8 @@ outlet_meta$longitude <- outlet_geocodes$longitude
# Persist post-processed outputs (same trio of objects as the crawler saves;
# the duplicate save() including feeding_all_reshaped wrote to the same path
# and was immediately overwritten, so it is dropped).
save(feeding_all, outlet_meta, stations_dt, file = "./data/edl_feeding_crawler_output_postprocessed.RData")
# alternatively, save individual files in csv
write.csv(feeding_all, file = "./data/feeding_all_postprocessed.csv")
write.csv(feeding_all_reshaped, file = "./data/feeding_all_reshaped_postprocessed.csv")
write.csv(outlet_meta, file = "./data/outlet_meta_postprocessed.csv")
write.csv(stations_dt, file = "./data/stations_dt_postprocessed.csv")
\ No newline at end of file
......@@ -9,7 +9,7 @@ This project uses openly available data to assess this sector. The feeding data
Basic R 4.0 template. See the `Dockerfile` and `install.R` files for modifications from the base template.
## Steps
1. First step is to crawl the http://www.edl.gov.lb/?=&lang=en website, specifically the daily supply section (http://www.edl.gov.lb/feeding.php). This is performed using the `./R/edl_feeding_crawler.R` followed by `./R/edl_feeding_postprocess.R`. It's a time-consuming step. It is best to run from the terminal as follows if you want to track inputs and outputs:
```
## without Renku
......@@ -18,6 +18,11 @@ Rscript ./R/edl_feeding_postprocess.R
## with Renku
renku run Rscript ./R/edl_feeding_crawler.R
renku run Rscript ./R/edl_feeding_postprocess.R
## If you append `update` as an argument, the script will first read available data and complete it with more recent data
Rscript ./R/edl_feeding_crawler.R update
Rscript ./R/edl_feeding_postprocess.R update
```
By default the script will fetch the data from 20 Oct 2008 to "today". You'll need to modify the script to change the dates. The earliest available date (I think) is 20 Oct 2008.
......
This source diff could not be displayed because it is stored in LFS. You can view the blob instead.
This source diff could not be displayed because it is stored in LFS. You can view the blob instead.
# Project dependencies; pbapply added for the parallel progress-bar crawl.
# (The diff residue duplicating this line without pbapply is collapsed.)
install.packages(c("cowplot", "plotly", "geojsonio", "rgdal", "leaflet", "reshape", "igraph", "visNetwork", "tidygeocoder", "dtw", "pbapply"))
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment