tdk/funcs.R

99 lines
2.9 KiB
R

#functions for scrape.R
library(rvest)
library(stringr)
library(sf)
library(jsonlite)
library(purrr)
# library(dplyr)
# Stack two records on top of each other even when their columns differ.
#
# a, b: named data frames or lists.
# Each input is first padded with an NA entry for every name that only the
# other input has, then the two padded inputs are combined with rbind().
rowbind <- function(a, b) {
  # Pad `own` with NA for each name found in `other` but not in `own`.
  pad_missing <- function(own, other) {
    absent <- setdiff(names(other), names(own))
    filler <- sapply(absent, function(col) NA)
    data.frame(c(own, filler))
  }

  padded_a <- pad_missing(a, b)
  padded_b <- pad_missing(b, a)
  rbind(padded_a, padded_b)
}
# Collect the listing links found on a page.
#
# url: address of the page, passed to read_html().
# Returns a character vector of absolute URLs built by prefixing
# "https://ingatlan.com" to every distinct href of the <a> elements inside
# ".listing" elements; character(0) when the page has no such links.
get_urls <- function(url) {
  page <- read_html(url)
  hrefs <- page %>%
    html_elements(".listing") %>%
    html_elements("a") %>%
    html_attr("href") %>%
    unique()

  # html_attr() yields NA for anchors without an href; pasting those would
  # produce the bogus URL "https://ingatlan.comNA".
  hrefs <- hrefs[!is.na(hrefs)]

  # paste0() treats a zero-length vector as "", so without this guard an
  # empty page would return the bare site root as if it were a listing.
  if (length(hrefs) == 0L) {
    return(character(0))
  }
  paste0("https://ingatlan.com", hrefs)
}
# Scrape a single listing page and collect its attributes.
#
# url: address of the listing page, passed to read_html().
# Returns the basic listing fields column-bound with the contents of the
# page's tables, or NA when the page does not yield exactly six distinct
# basic-property strings (after entries containing "€" are removed).
get_rent <- function(url){
# Example input:
# url <- "https://ingatlan.com/xiii-ker/kiado+lakas/csuszozsalus-lakas/33411349"
html <- read_html(url)
# Distinct texts of all <span> elements under ".listing-property".
alapok <- html %>% html_elements(".listing-property") %>%
html_elements("span") %>% html_text2() %>% unique()
# Drop every entry that contains a euro sign.
alapok <- alapok[!grepl(".*€.*",alapok)]
if(length(alapok) == 6){
# leiras = description text; tipus = distinct ".card-title" texts;
# tablazat = list of all <table> elements parsed by html_table().
leiras <- html %>% html_element("#listing-description") %>% html_text2()
tipus <- html %>% html_elements(".card-title") %>% html_text2() %>% unique()
tablazat <- html %>% html_elements("table") %>% html_table()
# Entries at even positions are used as values, entries at odd positions
# (with whitespace and dots stripped) as their names.
# NOTE(review): t() returns a matrix, so `names<-` sets a names attribute on
# a matrix and the `$<-` assignments below presumably coerce it to a list
# (with a warning) -- confirm the resulting shape.
data <- t(data.frame(alapok[seq(from=2, to=length(alapok), by=2)]))
names(data) <- gsub("[[:space:].]+","",alapok[seq(from=1, to=length(alapok), by=2)])
rownames(data) <- 1
# ar (price): first entry matching the pattern. Entries containing "€" were
# already filtered out above, so only the "Ft" alternative can match here.
data$ar <- alapok[grepl(".*(Ft)|€.*",alapok)][1]
# terulet (area): first entry containing "m2"; szobak (rooms): last entry.
data$terulet <- alapok[grepl(".*(m2)",alapok)][1]
data$szobak <- alapok[length(alapok)]
data$leiras <- leiras
# cim (address): card titles containing "kerület"/"Kerület" (district);
# tipus (type): card titles starting with "Kiadó".
data$cim <- tipus[grepl(".*[kK]erület.*",tipus)]
data$tipus <- tipus[grepl("^Kiadó.*",tipus)]
# Merge the first six tables pairwise (keeping all rows) into one table.
# NOTE(review): assumes the page contains at least six tables -- TODO confirm.
tdt <- as.data.frame(merge(merge(merge(tablazat[1],tablazat[2],all=T, no.dups=F),
merge(tablazat[3],tablazat[4],all=T, no.dups=F),all=T, no.dups=F),
merge(tablazat[5],tablazat[6],all=T, no.dups=F),all=T, no.dups=F), no.dups=F)
# The first column of tdt supplies the column names (whitespace and dots
# stripped); the remaining columns are transposed and bound to `data`.
cbind(data,setNames(data.frame(t(tdt[,-1])), gsub("[[:space:].]+","",tdt[,1])))
} else {
# Page did not yield exactly six basic-property strings.
NA
}
}
# Convert a price string such as "1,5 millió Ft" to a plain number.
#
# x: a single string of the form "<amount> <magnitude> ...". The amount may
#    use a decimal comma; the magnitude word is "ezer" (x 1 000),
#    "millió" (x 1 000 000) or "milliárd" (x 1 000 000 000). Only the first
#    element of x is used.
# Returns the scaled amount, or NA when the magnitude word is missing or not
# one of the three recognised words.
arconv <- function(x) {
  if (length(x) == 0L) {
    return(NA)
  }
  parts <- strsplit(as.character(x[[1]]), " ", fixed = TRUE)[[1]]
  amount <- as.numeric(sub(",", ".", parts[1], fixed = TRUE))
  unit <- parts[2]

  # With no second token (no space in the input, empty or NA input) the unit
  # is NA; comparing it in `if` would abort with "missing value where
  # TRUE/FALSE needed", so report NA instead.
  if (is.na(unit)) {
    return(NA)
  }

  if (unit == "ezer") {
    amount * 1000
  } else if (unit == "millió") {
    amount * 1000000
  } else if (unit == "milliárd") {
    amount * 1000000000
  } else {
    NA
  }
}
# Look up the coordinates of an address through the geocoding service at
# http://localhost:8080/search.php (presumably a local Nominatim instance --
# confirm).
#
# cim: the address to search for (a single string).
# Returns c(lat, lon) taken from the "lat" and "lon" fields of the response,
# or NA when the response is empty or the address itself is NA.
get_coords <- function(cim) {
  # Without this guard paste0() would turn NA into the literal query "NA".
  if (length(cim) == 1L && is.na(cim)) {
    return(NA)
  }

  # Encode only the query value and include reserved characters: encoding the
  # whole URL with the default reserved = FALSE leaves "&", "#", "+" and "?"
  # inside the address untouched, which truncates or splits the query.
  query <- URLencode(cim, reserved = TRUE)
  url <- paste0("http://localhost:8080/search.php?q=", query, "&limit=1")

  hit <- read_json(url, simplifyVector = TRUE)
  if (length(hit) == 0) {
    NA
  } else {
    c(hit$lat, hit$lon)
  }
}
# Extract the house number from an address string.
#
# x: a single address string. Its first two space-separated tokens are
#    skipped (presumably the district prefix, e.g. "XIII. kerület" --
#    confirm against the caller).
# Returns the last remaining token when it contains a digit; NA when it does
# not, when nothing remains after the two skipped tokens, or when x is NA.
hazszamconv <- function(x) {
  stopifnot(length(x) == 1L)
  if (is.na(x)) {
    return(NA)
  }

  tokens <- str_split(x, " ")[[1]]
  # Negative indices past the end are ignored, so an address with fewer than
  # three tokens leaves an empty vector here instead of failing.
  street_part <- tokens[-(1:2)]
  if (length(street_part) == 0L) {
    # Previously this case reached `if` with an empty condition and errored.
    return(NA)
  }

  last_token <- street_part[length(street_part)]
  if (grepl("[0-9]+", last_token)) {
    last_token
  } else {
    NA
  }
}
# Extract the street name from an address string.
#
# x: a single address string. Its first two space-separated tokens are
#    skipped (presumably the district prefix, e.g. "XIII. kerület" --
#    confirm against the caller).
# Returns the remaining tokens joined by single spaces, without the last
# token when that token contains a digit (i.e. looks like a house number).
# Returns "" when nothing remains after the two skipped tokens, and
# NA_character_ when x is NA.
utcaconv <- function(x) {
  stopifnot(length(x) == 1L)
  if (is.na(x)) {
    return(NA_character_)
  }

  tokens <- str_split(x, " ")[[1]]
  # Negative indices past the end are ignored, so an address with fewer than
  # three tokens leaves an empty vector here instead of failing.
  street_part <- tokens[-(1:2)]
  n <- length(street_part)
  if (n == 0L) {
    # Previously this case reached `if` with an empty condition and errored.
    return("")
  }

  ends_with_number <- grepl("[0-9]+", street_part[n])
  if (ends_with_number) {
    street_part <- street_part[-n]
  }
  paste(street_part, collapse = " ")
}