99 lines
2.9 KiB
R
99 lines
2.9 KiB
R
#functions for scrape.R
|
|
|
|
library(rvest)
|
|
library(stringr)
|
|
library(sf)
|
|
library(jsonlite)
|
|
library(purrr)
|
|
# library(dplyr)
|
|
|
|
# Row-bind two named records whose sets of names may differ.
#
# Each input is padded with an NA entry for every name that only the other
# input has, turned into a data frame, and the two are stacked with rbind()
# (which matches columns by name).
#
# a, b: named lists, data frames, or named atomic vectors.
# Returns: a data frame with the rows of `a` followed by the rows of `b` and
#          the union of their names as columns.
rowbind <- function(a, b) {
  # Pad `own` with one NA per name that appears only in `other`.
  pad <- function(own, other) {
    absent <- setdiff(names(other), names(own))
    filler <- setNames(rep(list(NA), length(absent)), absent)
    # Both pieces are lists so c() always yields a list; with an atomic
    # `own`, c() would stay atomic and data.frame() would build a single
    # multi-row column instead of one row with a column per name.
    data.frame(c(as.list(own), filler))
  }

  rbind(pad(a, b), pad(b, a))
}
|
|
|
|
# Collect the listing links found on one result page.
#
# Reads the page at `url`, takes the href of every <a> inside elements with
# class "listing", de-duplicates them and prefixes each with
# "https://ingatlan.com".
#
# url: address of the page to read (passed to read_html()).
# Returns: character vector of absolute URLs; character(0) when the page has
#          no matching links.
get_urls <- function(url) {
  page <- read_html(url)
  hrefs <- page %>%
    html_elements(".listing") %>%
    html_elements("a") %>%
    html_attr("href") %>%
    unique()

  # Anchors without an href come back as NA; pasting them would produce
  # "https://ingatlan.comNA".
  hrefs <- hrefs[!is.na(hrefs)]

  # paste() recycles a zero-length argument to "", which would return the
  # bare site root as if it were a listing.
  if (length(hrefs) == 0) {
    return(character(0))
  }

  paste0("https://ingatlan.com", hrefs)
}
|
|
|
|
# Scrape one listing page into a single data frame.
#
# The unique <span> texts inside ".listing-property" (entries containing the
# euro sign removed) are expected to be exactly six strings alternating
# label, value, label, value, ...; any other count makes the function
# return NA. On a matching page the result combines:
#   * one column per label (whitespace and dots stripped from the name),
#   * ar, terulet, szobak picked from the same span texts,
#   * leiras from "#listing-description",
#   * cim and tipus picked from the ".card-title" texts,
#   * one column per row of the merged first six HTML tables, named after the
#     first column of the merged table.
#
# url: address of the listing page (passed to read_html()).
# Returns: a data frame, or NA when the page does not have the expected six
#          property strings.
get_rent <- function(url) {
  # url <- "https://ingatlan.com/xiii-ker/kiado+lakas/csuszozsalus-lakas/33411349"
  html <- read_html(url)
  alapok <- html %>%
    html_elements(".listing-property") %>%
    html_elements("span") %>%
    html_text2() %>%
    unique()
  alapok <- alapok[!grepl(".*€.*", alapok)]

  if (length(alapok) != 6) {
    return(NA)
  }

  leiras <- html %>% html_element("#listing-description") %>% html_text2()
  tipus <- html %>% html_elements(".card-title") %>% html_text2() %>% unique()
  tablazat <- html %>% html_elements("table") %>% html_table()

  # Odd positions are labels, even positions are the matching values.
  labels <- gsub("[[:space:].]+", "", alapok[seq(from = 1, to = length(alapok), by = 2)])
  values <- alapok[seq(from = 2, to = length(alapok), by = 2)]

  # Build the record as a named list up front. Assigning with `$<-` into a
  # character matrix only works through an implicit coercion to list, which
  # emits a "Coercing LHS to a list" warning on every call.
  data <- as.list(setNames(values, labels))
  data$ar <- alapok[grepl(".*(Ft)|€.*", alapok)][1]
  data$terulet <- alapok[grepl(".*(m2)", alapok)][1]
  data$szobak <- alapok[length(alapok)]
  data$leiras <- leiras
  data$cim <- tipus[grepl(".*[kK]erület.*", tipus)]
  data$tipus <- tipus[grepl("^Kiadó.*", tipus)]

  # Single-bracket indexing is deliberate: each operand is a one-element
  # list, so merge() converts it with as.data.frame() before joining.
  # The pairwise nesting (1+2, 3+4, 5+6) is kept as is.
  tables_12 <- merge(tablazat[1], tablazat[2], all = TRUE, no.dups = FALSE)
  tables_34 <- merge(tablazat[3], tablazat[4], all = TRUE, no.dups = FALSE)
  tables_56 <- merge(tablazat[5], tablazat[6], all = TRUE, no.dups = FALSE)
  tdt <- as.data.frame(
    merge(
      merge(tables_12, tables_34, all = TRUE, no.dups = FALSE),
      tables_56,
      all = TRUE, no.dups = FALSE
    )
  )

  # First column of the merged table supplies the column names, the rest
  # supplies the values.
  cbind(data, setNames(data.frame(t(tdt[, -1])), gsub("[[:space:].]+", "", tdt[, 1])))
}
|
|
|
|
# Convert a price string such as "250 ezer Ft" or "1,5 millió Ft" to a number.
#
# The string is split on single spaces. The first token is the amount
# (decimal comma allowed) and the second token is the magnitude word:
# "ezer" (x 1 000), "millió" (x 1 000 000) or "milliárd" (x 1 000 000 000).
#
# x: a single price string (only the first element is used).
# Returns: the numeric amount, or NA when the magnitude word is missing or
#          not one of the three recognised words.
arconv <- function(x) {
  if (length(x) == 0) {
    return(NA)
  }

  tokens <- strsplit(as.character(x[1]), " ", fixed = TRUE)[[1]]

  # Without a second token there is no magnitude to compare against;
  # testing NA in an `if` would raise an error instead of yielding NA.
  if (length(tokens) < 2 || is.na(tokens[2])) {
    return(NA)
  }

  amount <- as.numeric(sub(",", ".", tokens[1], fixed = TRUE))

  magnitudes <- c(1000, 1000000, 1000000000)
  names(magnitudes) <- c("ezer", "millió", "milliárd")
  hit <- match(tokens[2], names(magnitudes))
  if (is.na(hit)) {
    return(NA)
  }

  amount * magnitudes[[hit]]
}
|
|
|
|
# Look up coordinates for an address via the geocoding service on
# localhost:8080.
# NOTE(review): the search.php endpoint looks like a local Nominatim
# instance - confirm.
#
# cim: the address to search for.
# Returns: c(lat, lon) exactly as the service returns them, or NA when the
#          response is empty.
get_coords <- function(cim) {
  # Encode only the address, with reserved = TRUE, so characters such as
  # "&", "#", "+" or "=" inside it cannot alter the query string.
  query <- URLencode(as.character(cim), reserved = TRUE)
  url <- paste0("http://localhost:8080/search.php?q=", query, "&limit=1")

  data <- read_json(url, simplifyVector = TRUE)
  if (length(data) == 0) {
    NA
  } else {
    c(data$lat, data$lon)
  }
}
|
|
|
|
# Extract the house number from an address string.
#
# The address is split on single spaces and the first two tokens are
# discarded. If the last remaining token contains a digit it is returned as
# the house number.
#
# x: a single address string (only the first element is used).
# Returns: the last token as a string when it contains a digit; NA otherwise,
#          including when nothing is left after the first two tokens.
hazszamconv <- function(x) {
  if (length(x) == 0) {
    return(NA)
  }

  tokens <- strsplit(as.character(x[1]), " ", fixed = TRUE)[[1]]
  rest <- tokens[-c(1:2)]

  # An address with two or fewer tokens has no candidate; without this
  # guard the digit test below would receive a zero-length condition.
  if (length(rest) == 0) {
    return(NA)
  }

  last_token <- rest[length(rest)]
  if (grepl("[0-9]+", last_token)) {
    last_token
  } else {
    NA
  }
}
|
|
|
|
# Extract the street part of an address string.
#
# The address is split on single spaces and the first two tokens are
# discarded. If the last remaining token contains a digit (a house number)
# it is dropped as well. The remaining tokens are joined with single spaces.
#
# x: a single address string (only the first element is used).
# Returns: the street as one string; "" when nothing is left.
utcaconv <- function(x) {
  if (length(x) == 0) {
    return("")
  }

  tokens <- strsplit(as.character(x[1]), " ", fixed = TRUE)[[1]]
  rest <- tokens[-c(1:2)]
  n_rest <- length(rest)

  # Check the length first: an address with two or fewer tokens leaves
  # nothing to test, and a zero-length `if` condition is an error.
  if (n_rest > 0 && grepl("[0-9]+", rest[n_rest])) {
    rest <- rest[-n_rest]
  }

  paste(rest, collapse = " ")
}
|