# Functions for scrape.R
library(rvest)
library(stringr)
library(sf)
library(jsonlite)
library(purrr)
# library(dplyr)

# Row-bind two records whose sets of names may differ.
#
# Each input is treated as a named list (a one-row data.frame works too).
# Names present in only one input are added to the other filled with NA,
# so both sides have the same columns before rbind() matches them by name.
#
# a, b: named lists or data.frames.
# Returns a data.frame containing the rows of `a` followed by those of `b`.
rowbind <- function(a, b) {
  pad_missing <- function(x, other) {
    absent <- setdiff(names(other), names(x))
    filler <- setNames(as.list(rep(NA, length(absent))), absent)
    data.frame(c(as.list(x), filler))
  }
  rbind(pad_missing(a, b), pad_missing(b, a))
}

# Collect the links found inside `.listing` elements of a page.
#
# url: address of the page to read.
# Returns a character vector of absolute URLs (the href values prefixed with
# "https://ingatlan.com"); character(0) when the page has no such links.
get_urls <- function(url) {
  html <- read_html(url)
  ends <- html %>%
    html_elements(".listing") %>%
    html_elements("a") %>%
    html_attr("href") %>%
    unique()
  # Anchors without an href yield NA; pasting those would produce bogus URLs.
  ends <- ends[!is.na(ends)]
  # paste0() turns a zero-length argument into "", which would return the bare
  # site root as if it were a listing, so handle the empty case explicitly.
  if (length(ends) == 0) {
    return(character(0))
  }
  paste0("https://ingatlan.com", ends)
}

# Scrape one listing page into a single record.
#
# url: address of the listing page.
# Returns the scraped fields combined with cbind(), or NA when the page does
# not yield exactly six distinct, euro-free ".listing-property span" texts.
get_rent <- function(url) {
  # url <- "https://ingatlan.com/xiii-ker/kiado+lakas/csuszozsalus-lakas/33411349"
  html <- read_html(url)
  alapok <- html %>%
    html_elements(".listing-property") %>%
    html_elements("span") %>%
    html_text2() %>%
    unique()
  # Drop every span text that contains a euro sign.
  alapok <- alapok[!grepl(".*€.*", alapok)]
  if (length(alapok) == 6) {
    leiras <- html %>%
      html_element("#listing-description") %>%
      html_text2()
    tipus <- html %>%
      html_elements(".card-title") %>%
      html_text2() %>%
      unique()
    tablazat <- html %>%
      html_elements("table") %>%
      html_table()

    # Even positions are used as values, odd positions as their names
    # (whitespace and dots stripped from the names).
    data <- t(data.frame(alapok[seq(from = 2, to = length(alapok), by = 2)]))
    names(data) <- gsub(
      "[[:space:].]+", "",
      alapok[seq(from = 1, to = length(alapok), by = 2)]
    )
    rownames(data) <- 1
    # NOTE(review): `data` is a matrix at this point, so the `$<-` assignments
    # below rely on R coercing it to a list - confirm this is intended.
    data$ar <- alapok[grepl(".*(Ft)|€.*", alapok)][1]
    data$terulet <- alapok[grepl(".*(m2)", alapok)][1]
    data$szobak <- alapok[length(alapok)]
    data$leiras <- leiras
    # NOTE(review): these two filters may match zero or several card titles;
    # verify that the final cbind() copes with that.
    data$cim <- tipus[grepl(".*[kK]erület.*", tipus)]
    data$tipus <- tipus[grepl("^Kiadó.*", tipus)]

    # Outer-merge the first six tables of the page pairwise into one table.
    tdt <- as.data.frame(
      merge(
        merge(
          merge(tablazat[1], tablazat[2], all = TRUE, no.dups = FALSE),
          merge(tablazat[3], tablazat[4], all = TRUE, no.dups = FALSE),
          all = TRUE, no.dups = FALSE
        ),
        merge(tablazat[5], tablazat[6], all = TRUE, no.dups = FALSE),
        all = TRUE, no.dups = FALSE
      ),
      no.dups = FALSE
    )
    # The first column of the merged table supplies the column names, the
    # remaining columns (transposed) supply the values.
    cbind(
      data,
      setNames(data.frame(t(tdt[, -1])), gsub("[[:space:].]+", "", tdt[, 1]))
    )
  } else {
    NA
  }
}

# Convert a textual amount such as "1,5 millió" into a number.
#
# x: a single string "<number> <unit>", decimal comma allowed; the recognised
#    units are "ezer" (1e3), "millió" (1e6) and "milliárd" (1e9).
# Returns the numeric amount, or NA when the unit is missing or unknown.
arconv <- function(x) {
  parts <- str_split(x, " ", simplify = TRUE)
  amount <- as.numeric(str_replace(parts[1], pattern = ",", replacement = "."))
  unit <- parts[2]
  multipliers <- c("ezer" = 1e3, "millió" = 1e6, "milliárd" = 1e9)
  # A missing unit (no second token, or NA input) used to make `if` fail with
  # "missing value where TRUE/FALSE needed"; treat it like an unknown unit.
  if (is.na(unit) || !(unit %in% names(multipliers))) {
    return(NA)
  }
  amount * multipliers[[unit]]
}

# Look up an address in the geocoding service on localhost:8080.
#
# cim: the address to search for.
# Returns c(lat, lon) taken from the first result as delivered by the
# service, or NA when the service returns nothing.
get_coords <- function(cim) {
  # Encode only the address, reserved characters included, so that "&", "#",
  # "?" or "+" inside it cannot break the query string.
  query <- URLencode(as.character(cim), reserved = TRUE)
  url <- paste0("http://localhost:8080/search.php?q=", query, "&limit=1")
  data <- read_json(url, simplifyVector = TRUE)
  if (length(data) == 0) {
    NA
  } else {
    c(data$lat, data$lon)
  }
}

# Space-separated tokens of a single address string after its first two
# tokens; character(0) when the address has two tokens or fewer.
.address_rest <- function(x) {
  stopifnot(length(x) == 1L)
  str_split(x, " ")[[1]][-(1:2)]
}

# Extract the house number from an address.
#
# x: a single address string; its first two space-separated tokens are
#    ignored.
# Returns the last token when it contains a digit, otherwise NA.
hazszamconv <- function(x) {
  rest <- .address_rest(x)
  ends_with_number <- length(rest) > 0 &&
    grepl("[0-9]+", rest[length(rest)])
  if (ends_with_number) {
    rest[length(rest)]
  } else {
    NA
  }
}

# Extract the street part of an address.
#
# x: a single address string; its first two space-separated tokens are
#    ignored.
# Returns the remaining tokens joined by spaces, without the last token when
# that token contains a digit; "" when nothing remains.
utcaconv <- function(x) {
  rest <- .address_rest(x)
  ends_with_number <- length(rest) > 0 &&
    grepl("[0-9]+", rest[length(rest)])
  if (ends_with_number) {
    # Earlier variant, kept for reference (also dropped the token before the
    # house number):
    # paste(arr[-c(length(arr)-1,length(arr))], collapse = " ", sep = " ")
    paste(rest[-length(rest)], collapse = " ")
  } else {
    paste(rest, collapse = " ")
  }
}