# tdk/scrape.R
#
# 88 lines
# 2.1 KiB
# R

library(stringr)
library(dplyr)
source("funcs.R")
# Determine how many result pages the listing has.
root_link <- "https://ingatlan.com/lista/kiado+lakas+budapest"

# The text of the pagination element is split on spaces and its third token
# is taken as the total page count.
pagination_tokens <- read_html(root_link) %>%
  html_elements(".pagination__page-number") %>%
  html_text2() %>%
  strsplit(" ") %>%
  unlist()
pageno <- as.numeric(pagination_tokens[3])

# Fail early with a clear message: an NA page count would only surface later
# as a cryptic error in the page loop, and a count below 1 would make a
# `1:pageno` sequence run backwards.
if (is.na(pageno) || pageno < 1) {
  stop(
    "Could not determine the number of result pages from ", root_link,
    call. = FALSE
  )
}
# Collect the listing URLs page by page, then scrape every listing.
berelt_url <- ""
data <- data.frame()
na_urls <- c("")
last <- 1

# Call `fetch(url)` until it succeeds or the error message contains "404".
#
# fetch: function taking a single URL (here get_urls or get_rent).
# url:   the URL handed to `fetch`.
# wait:  seconds to sleep between attempts.
#
# Returns the value of `fetch(url)`, or NULL when a 404 was reported.
# When the error message contains "403" the CORVINUS connection is toggled
# through nmcli and one extra wait is inserted before the next attempt.
# Any other error is retried indefinitely after `wait` seconds.
fetch_with_retry <- function(fetch, url, wait = 10) {
  repeat {
    result <- try(fetch(url), silent = TRUE)
    # inherits() stays a scalar test even when the result has several
    # classes (e.g. a tibble), where `class(x) != "try-error"` would not.
    if (!inherits(result, "try-error")) {
      return(result)
    }
    if (grepl(".*404.*", result[1])) {
      print("404")
      return(NULL)
    }
    if (grepl(".*403.*", result[1])) {
      # Two sleeps of `wait` seconds happen before the next attempt.
      print(paste("next try:", paste(Sys.time() + 2 * wait), " ", result[1]))
      system("nmcli c show --active | grep CORVINUS && nmcli c down CORVINUS || nmcli c up CORVINUS")
      Sys.sleep(wait)
    }
    Sys.sleep(wait)
  }
}

for (i in last:pageno) {
  # NOTE(review): `last` / `lastrent` presumably exist so an interrupted run
  # can be inspected or resumed by hand - confirm.
  last <- i
  url <- paste0(root_link, "?page=", as.character(i))
  print(paste0(as.character(i), ". oldal"))

  berelt_url <- fetch_with_retry(get_urls, url)
  if (is.null(berelt_url)) {
    # 404 for the whole page: nothing to scrape here.
    berelt_url <- character(0)
  }
  berelt_url <- unique(berelt_url[berelt_url != ""])
  if (length(berelt_url) == 0) {
    next
  }

  lastrent <- 1
  # One progress bar per page, created once and closed after the page.
  pb <- txtProgressBar(
    min = 0,
    max = length(berelt_url),
    style = 3,
    char = "="
  )
  for (j in seq_along(berelt_url)) {
    lastrent <- j
    setTxtProgressBar(pb, j)
    url <- berelt_url[j]

    rent <- fetch_with_retry(get_rent, url)
    if (is.data.frame(rent)) {
      # Appended immediately so partial results survive an interruption.
      data <- rowbind(data, rent)
    } else {
      # Anything that is not a data frame (including a 404) is recorded
      # as a URL without data.
      na_urls <- c(na_urls, url)
    }
  }
  close(pb)
}
# Post-processing: drop the first row, remove duplicate rows, then export.
# NOTE(review): `data` starts out as an empty data.frame(), so dropping row 1
# may discard a real listing unless rowbind() inserts a placeholder row -
# confirm against funcs.R.
data <- distinct(data[-1, ])

# Show the result as a data.table and write it to data.csv.
dt <- data.table::as.data.table(data)
dt
write.csv(x = dt, file = "data.csv")

# Summary output: row count, URLs that yielded no data frame, and structure.
nrow(data)
unique(na_urls)
str(data)