albérletes dolgok egyben

This commit is contained in:
nagbalae 2023-04-02 16:16:48 +02:00
parent 7988d6cbc5
commit 2b82d298ed
26 changed files with 235397 additions and 0 deletions

4
.gitignore vendored Normal file
View File

@ -0,0 +1,4 @@
.Rproj.user
.Rhistory
.RData
.Ruserdata

Binary file not shown.

28847
adat.R Normal file

File diff suppressed because one or more lines are too long

46981
adat_pozicioval.csv Normal file

File diff suppressed because it is too large Load Diff

4062
albi.html Normal file

File diff suppressed because one or more lines are too long

60128
bak.csv Normal file

File diff suppressed because it is too large Load Diff

16
beolvas.R Normal file
View File

@ -0,0 +1,16 @@
library(data.table)
library(dplyr)
# Read the listing table from CSV into a data.table.
# Alternative input (kept for reference):
# df <- read.csv("data.csv") %>% as.data.table()
df <- as.data.table(read.csv("adat_pozicioval.csv"))

# Drop the columns named X and X.1 (presumably row-index columns left behind
# by earlier write.csv() calls -- TODO confirm).
df <- df[, -c("X", "X.1")]

# Columns that are converted to factors below.
factorlist <- c(
  "tipus", "Akadálymentesített", "Átlagáramfogyasztásinfo",
  "Átlaggázfogyasztásinfo", "Belmagasság", "Bútorozott", "Dohányzás",
  "Emelet", "Energiatanúsítvány", "Építéséve", "Épületszintjei",
  "Fürdőéswc", "Fűtés", "Gépesített", "Ingatlanállapota",
  "Kertkapcsolatos", "Kilátás", "Kisállat", "Komfort", "Költözhető",
  "Közösköltség", "Légkondicionáló", "Lift", "Minbérletiidő", "Parkolás",
  "Rezsiköltség", "Szigetelés", "Tájolás", "Tetőtér", "Panelprogram",
  "Parkolóhelyára"
)

# Replace the placeholder "nincs megadva" ("not specified") with NA and
# convert each listed column to a factor.
df[, factorlist] <- lapply(
  df[, ..factorlist],
  function(column) {
    as.factor(ifelse(column == "nincs megadva", NA, column))
  }
)

# Erkélymérete (column name noted by the author; not part of factorlist)
# lapply(df,typeof)
### tavolsag.R comes next

61891
data.csv Normal file

File diff suppressed because it is too large Load Diff

5127
data/prc_fsc_idx.tsv Normal file

File diff suppressed because it is too large Load Diff

13
docs/bevezetes Normal file
View File

@ -0,0 +1,13 @@
# Bevezetés
Az elmúlt néhány évben az ingatlanok árai jelentős emelkedésen mentek keresztül (valami portfolio cikk vagy ilyesmi cite),
így egyre kevesebb embernek adatik meg a lehetőség, hogy a jelenlegi magas árak mellett is saját lakást tudjon venni.
Ennek következtében nőtt az ingatlanbérleti piac jelentősége, és az ezen a piacon megfigyelhető árazási tényezők jelentős hatással
lehetnek ezrek mindennapi életére.
Elemzésemben a budapesti ingatlanok áraira hatással levő változókat elemzem, mind a mennyiségi, minőségi és területi változókra
kitérve a teljesség kedvéért.
Az elemzésben az Ingatlan.com-ról scrapelt adatokat kombináltam az OpenStreetMap nevű online térkép adataival, így megkapva a
felhasznált bérlakás-adathalmazt, amelyet "Geographically Weighted Regression", azaz földrajzilag súlyozott regresszió segítségével elemzek.
# Absztrakt
A közelmúltban jelentősen emelked

15793
epitett.json Normal file

File diff suppressed because it is too large Load Diff

98
funcs.R Normal file
View File

@ -0,0 +1,98 @@
#functions for scrape.R
library(rvest)
library(stringr)
library(sf)
library(jsonlite)
library(purrr)
# library(dplyr)
# Row-bind two data frames (or named lists) whose column sets may differ.
# Each side is first extended with NA-filled columns for every name that only
# the other side has; the two padded data frames are then stacked with rbind().
rowbind <- function(a, b) {
  pad_with_na <- function(target, other) {
    absent <- setdiff(names(other), names(target))
    filler <- setNames(as.list(rep(NA, length(absent))), absent)
    data.frame(c(target, filler))
  }
  top <- pad_with_na(a, b)
  bottom <- pad_with_na(b, a)
  rbind(top, bottom)
}
# Collect listing links from one result page.
#   url: address of the page, passed to read_html().
# Returns a character vector: the unique href values of all <a> elements
# found inside ".listing" elements, each prefixed with "https://ingatlan.com".
get_urls <- function(url) {
  page <- read_html(url)
  anchors <- html_elements(html_elements(page, ".listing"), "a")
  hrefs <- unique(html_attr(anchors, "href"))
  paste0("https://ingatlan.com", hrefs)
}
# Scrape a single listing page into a one-row table.
#   url: address of the listing page, passed to read_html().
# Returns the cbind() of the headline properties and the transposed detail
# tables when exactly six headline entries remain after dropping the
# euro-denominated ones; returns NA otherwise.
get_rent <- function(url){
# Example listing used during development:
# url <- "https://ingatlan.com/xiii-ker/kiado+lakas/csuszozsalus-lakas/33411349"
html <- read_html(url)
# Unique text of every <span> inside ".listing-property" elements.
alapok <- html %>% html_elements(".listing-property") %>%
html_elements("span") %>% html_text2() %>% unique()
# Discard entries that contain a euro sign.
alapok <- alapok[!grepl(".*€.*",alapok)]
if(length(alapok) == 6){
# Description text, card titles and all HTML tables of the page.
leiras <- html %>% html_element("#listing-description") %>% html_text2()
tipus <- html %>% html_elements(".card-title") %>% html_text2() %>% unique()
tablazat <- html %>% html_elements("table") %>% html_table()
# Entries at even positions are taken as values, entries at odd positions as
# their names (whitespace and dots stripped from the names).
data <- t(data.frame(alapok[seq(from=2, to=length(alapok), by=2)]))
names(data) <- gsub("[[:space:].]+","",alapok[seq(from=1, to=length(alapok), by=2)])
rownames(data) <- 1
# NOTE(review): `data` is a matrix here (result of t()), so the `$<-`
# assignments below depend on R coercing it to a list -- verify before
# refactoring.
# First entry matching the price pattern, first entry matching "m2", and the
# last entry, respectively.
data$ar <- alapok[grepl(".*(Ft)|€.*",alapok)][1]
data$terulet <- alapok[grepl(".*(m2)",alapok)][1]
data$szobak <- alapok[length(alapok)]
data$leiras <- leiras
# Card title containing "kerület"/"Kerület" is stored as the address, the one
# starting with "Kiadó" as the listing type.
data$cim <- tipus[grepl(".*[kK]erület.*",tipus)]
data$tipus <- tipus[grepl("^Kiadó.*",tipus)]
# Outer-merge the first six tables pairwise into one two-column table.
tdt <- as.data.frame(merge(merge(merge(tablazat[1],tablazat[2],all=T, no.dups=F),
merge(tablazat[3],tablazat[4],all=T, no.dups=F),all=T, no.dups=F),
merge(tablazat[5],tablazat[6],all=T, no.dups=F),all=T, no.dups=F), no.dups=F)
# Transpose the merged table: its first column supplies the column names
# (whitespace and dots stripped), the remaining column(s) the values.
cbind(data,setNames(data.frame(t(tdt[,-1])), gsub("[[:space:].]+","",tdt[,1])))
} else {
# Page does not have the expected six headline entries.
NA
}
}
# Convert a price string of the form "<number> <unit> ..." to a number.
#   x: a single string; the number may use a decimal comma, and the unit must
#      be "ezer" (thousand), "millió" (million) or "milliárd" (billion).
# Returns the numeric value, or NA_real_ when the input is missing, has no
# unit token, or the unit is not one of the three above (previously a missing
# unit raised "missing value where TRUE/FALSE needed").
arconv <- function(x) {
  multipliers <- c("ezer" = 1e3, "millió" = 1e6, "milliárd" = 1e9)

  parts <- strsplit(as.character(x), " ", fixed = TRUE)
  parts <- if (length(parts) > 0L) parts[[1L]] else character(0)
  if (length(parts) < 2L || is.na(parts[2L])) {
    return(NA_real_)
  }

  # match() rather than switch() so that string encodings are compared the
  # same way `==` compared them before.
  unit_index <- match(parts[2L], names(multipliers))
  if (is.na(unit_index)) {
    return(NA_real_)
  }

  ar <- as.numeric(sub(",", ".", parts[1L], fixed = TRUE))
  ar * multipliers[[unit_index]]
}
# Geocode one address through the search endpoint on localhost:8080
# (presumably a local Nominatim instance -- TODO confirm).
#   cim: a single address string.
# Returns c(lat, lon) taken from the first result of the JSON response, or NA
# when the address is missing or the service returns no result.
get_coords <- function(cim) {
  # A missing address would otherwise be sent as the literal query "NA".
  if (length(cim) != 1L || is.na(cim)) {
    return(NA)
  }
  # Encode only the query value, including reserved characters, so that
  # "&", "#", "+" etc. inside an address cannot break the query string.
  query <- URLencode(as.character(cim), reserved = TRUE)
  url <- paste0("http://localhost:8080/search.php?q=", query, "&limit=1")
  data <- read_json(url, simplifyVector = TRUE)
  if (length(data) == 0) {
    NA
  } else {
    c(data$lat, data$lon)
  }
}
# Extract the house number from an address string.
#   x: a single address; its first two space-separated tokens are skipped
#      (e.g. "XIII. kerület").
# Returns the last remaining token when it contains a digit, otherwise
# NA_character_. Missing input and addresses with no tokens after the first
# two also yield NA_character_ instead of an error.
hazszamconv <- function(x) {
  if (length(x) != 1L || is.na(x)) {
    return(NA_character_)
  }
  tokens <- strsplit(as.character(x), " ", fixed = TRUE)[[1L]]
  street_tokens <- tokens[-(1:2)]
  if (length(street_tokens) == 0L) {
    return(NA_character_)
  }
  last_token <- street_tokens[length(street_tokens)]
  if (grepl("[0-9]+", last_token)) {
    last_token
  } else {
    NA_character_
  }
}
# Extract the street name from an address string.
#   x: a single address; its first two space-separated tokens are skipped
#      (e.g. "XIII. kerület").
# Returns the remaining tokens joined by single spaces, without the last
# token when that token contains a digit (the house number). An address with
# nothing after the first two tokens yields "" instead of an error; missing
# input yields NA_character_.
utcaconv <- function(x) {
  if (length(x) != 1L || is.na(x)) {
    return(NA_character_)
  }
  tokens <- strsplit(as.character(x), " ", fixed = TRUE)[[1L]]
  street_tokens <- tokens[-(1:2)]
  n_tokens <- length(street_tokens)
  ends_with_number <- n_tokens > 0L && grepl("[0-9]+", street_tokens[n_tokens])
  if (ends_with_number) {
    # Earlier variant that also dropped the token before the house number:
    # paste(arr[-c(length(arr)-1,length(arr))], collapse = " ", sep = " ")
    street_tokens <- street_tokens[-n_tokens]
  }
  paste(street_tokens, collapse = " ")
}

16
mapout.R Normal file
View File

@ -0,0 +1,16 @@
library(leaflet)
library(purrr)
# Data shown on the map; enable the sampled variant to plot a random subset.
# df2 <- df[sample(1:nrow(df),4000),]
df2 <- df

# Colour bins for price per area (ar / ater): 0, the 20/40/60/80 % quantiles
# (i.e. quintile boundaries) and Inf.
bins <- c(
  0,
  quantile(df$ar / df$ater, probs = c(0.2, 0.4, 0.6, 0.8)),
  Inf
)
pal <- colorBin("YlOrRd", domain = df2$ar / df2$ater, bins = bins)

# One marker per listing (popup: address), a legend for the price palette,
# plus markers for the metro and train stations.
leaflet(df2) |>
  addTiles() |>
  # addCircleMarkers(~lon,~lat,popup = df2$cim,radius = 10,fillColor = ~pal(ar/ater),fillOpacity = 0.8,stroke = F) %>%
  addMarkers(~lon, ~lat, popup = df2$cim) |>
  addLegend(
    pal = pal,
    values = df2$ar / df2$ater,
    title = "Ft/m2/hó",
    position = "bottomright"
  ) |>
  addMarkers(metro$lon, metro$lat, popup = metro$name) |>
  addMarkers(vonat$lon, vonat$lat, popup = vonat$name)

4062
metro.html Normal file

File diff suppressed because one or more lines are too long

30
model.R Normal file
View File

@ -0,0 +1,30 @@
library(GWmodel)
# Exploratory modelling script. Expects `df` to already contain the columns
# used below (ar, ater, szoba, felszoba, ker, metrotav, vonattav, tomkoz,
# Emelet, tipus).

# Modelling sample: drop listings of type "Kiadó szoba" (room for rent) and
# keep only listings with ar <= 400000.
df3 <- df[df$tipus != "Kiadó szoba",]
df3 <- df3[df3$ar <= 400000,]
# Narrow copy with the main numeric/location columns, used for histograms.
df4 <- df3[,c("ar","ater","szoba","felszoba","ker","metrotav","vonattav")]
# Successive linear-model specifications; every fit overwrites `lmmod`, so
# only the last one (ar/ater as response) is what summary() reports below.
lmmod <- lm(log(ar)~ater+log(tomkoz+1)+szoba+felszoba+Emelet,df3)
lmmod <- lm(log(ar)~as.factor(ker)+ater+log(tomkoz+1)+szoba+felszoba+Emelet,df3)
lmmod <- lm(log(ar)~as.factor(ker)+ater+log(tomkoz+1)+szoba+felszoba+Emelet,df3)
lmmod <- lm(log(ar)~as.factor(ker)+log(ater)+log(tomkoz+1)+szoba+felszoba+Emelet,df3)
lmmod <- lm(log(ar/ater)~as.factor(ker)+log(tomkoz+1)+szoba+felszoba+Emelet,df3)
lmmod <- lm((ar/ater)~as.factor(ker)+log(tomkoz+1)+szoba+felszoba+Emelet,df3)
# Distribution of price per area, raw and log-transformed.
hist(df4$ar/df4$ater)
hist(log(df4$ar/df4$ater))
summary(lmmod)
str(df3)
hist(log(df4$ater))
min(df3$ar)
# Scatterplot matrix of columns 39 to 48 (selected by position).
pairs(df3[,39:48])
str(df3)
# Distributions of the distance variables (log scale) and of price and area.
hist(log(df3$tomkoz))
hist(log(df3$vonattav))
hist(log(df3$metrotav))
hist(df3$ar)
hist(log(df3$ar))
hist(df3$ater)
hist(log(df3$ater))
# Number of listings per district, and price against tomkoz.
plot(as.factor(df3$ker))
plot(df3$ar,df3$tomkoz)
# Correlation matrix of the numeric variables.
cor(df3[,c("ater","ar","szoba","felszoba","metrotav","vonattav")])
colnames(df3)
psych::describe(df3)

Binary file not shown.

BIN
rossz/65-162-4-PB.pdf Normal file

Binary file not shown.

87
scrape.R Normal file
View File

@ -0,0 +1,87 @@
library(stringr)
library(dplyr)
source("funcs.R")
# Determine the number of result pages: third space-separated token of the
# ".pagination__page-number" text on the first result page.
root_link <- "https://ingatlan.com/lista/kiado+lakas+budapest"
pageno <- read_html(root_link) %>%
  html_elements(".pagination__page-number") %>%
  html_text2() %>%
  strsplit(" ")
pageno <- as.numeric(unlist(pageno)[3])

# Call `fetcher(url)` until it succeeds or fails with a 404.
#   fetcher: function of one argument (get_urls or get_rent).
#   url:     address passed to `fetcher`.
# Returns the fetcher's result on success, or the "try-error" object when the
# error message contains "404". On a message containing "403" the CORVINUS
# network connection is toggled via nmcli before retrying; every failed
# attempt is followed by a pause.
fetch_with_retry <- function(fetcher, url) {
  repeat {
    result <- try(fetcher(url), silent = TRUE)
    if (!inherits(result, "try-error")) {
      return(result)
    }
    if (grepl(".*404.*", result[1])) {
      print("404")
      return(result)
    }
    if (grepl(".*403.*", result[1])) {
      print(paste("next try:", paste(Sys.time() + 60), " ", result[1]))
      system("nmcli c show --active | grep CORVINUS && nmcli c down CORVINUS || nmcli c up CORVINUS")
      Sys.sleep(10)
    }
    Sys.sleep(10)
  }
}

# Fetch the individual listing URLs page by page and scrape each listing.
berelt_url <- ""
data <- data.frame()
na_urls <- c("")
# `last` / `lastrent` record the page and listing currently being processed.
last <- 1
for (page in last:pageno) {
  last <- page
  url <- paste0(root_link, "?page=", as.character(page))
  print(paste0(as.character(page), ". oldal"))

  berelt_url <- fetch_with_retry(get_urls, url)
  if (inherits(berelt_url, "try-error")) {
    # Page could not be fetched (404): remember it and move on instead of
    # treating the error message as a listing URL.
    na_urls <- c(na_urls, url)
    next
  }
  berelt_url <- unique(berelt_url[berelt_url != ""])
  if (length(berelt_url) == 0L) {
    next
  }

  lastrent <- 1
  # One progress bar per page, created once rather than on every iteration.
  pb <- txtProgressBar(
    min = 0,
    max = length(berelt_url),
    style = 3,
    char = "="
  )
  for (i in seq_along(berelt_url)) {
    lastrent <- i
    setTxtProgressBar(pb, i)
    url <- berelt_url[i]
    #print(paste0(round(i/length(berelt_url)*100,digits = 2),"%"))
    rent <- fetch_with_retry(get_rent, url)
    if (is.data.frame(rent)) {
      # Appended immediately so that partial results survive an interruption.
      data <- rowbind(data, rent)
    } else {
      na_urls <- c(na_urls, url)
    }
  }
  close(pb)
}

# The first rowbind() onto the empty data frame produces an all-NA first row;
# remove it, then drop duplicate listings.
data <- data[-1, ]
data <- data %>% distinct()
dt <- data.table::as.data.table(data)
dt
write.csv(dt, file = "data.csv")
nrow(data)
unique(na_urls)
str(data)

21
scratchpad.R Normal file
View File

@ -0,0 +1,21 @@
# Scratch snippets for interactive experimentation; they rely on objects
# (`url`, `df`) and functions (read_html, get_coords, read_json, ...) that
# must already exist in the session.

# Re-run the headline extraction of a listing page step by step.
html <- read_html(url)
alapok <- html %>% html_elements(".listing-property") %>%
html_elements("span") %>% html_text2() %>% unique()
alapok <- alapok[!grepl(".*€.*",alapok)]
alapok
# NOTE(review): this pattern uses lowercase "(ft)", whereas get_rent() in
# funcs.R matches "(Ft)" -- confirm which one is intended.
ar <- alapok[grepl(".*(ft)|€.*",alapok)][1]
ar
# Geocode the first address, first via the helper and then by hand.
get_coords(df$cim[1])
url<-URLencode(paste0("http://localhost:8080/search.php?q=",df$cim[1]))
data <- head(read_json(url, simplifyVector = T),n=1)
if(length(data) == 0){
NA
} else {
c(data$lat,data$lon)
}
# Write `df` to adat.R as R source, keep an in-memory copy in `dfbak`, and
# load the dumped file back.
dump("df","adat.R")
dfbak <- df
source("adat.R")

2
spacialregr.R Normal file
View File

@ -0,0 +1,2 @@
library(sp)
# Build an sp point layer from df3: the lon/lat columns are the coordinates,
# the full table is attached as attribute data.
sdf <- SpatialPointsDataFrame(
  coords = df3[, c("lon", "lat")],
  data = df3
)

34
tavolsag.R Normal file
View File

@ -0,0 +1,34 @@
library(osmdata)
library(sf)
library(sfheaders)
library(geosphere)
# Distance from each point of the first set to its nearest point in the
# second set.
#   x1, y1: coordinates of the points to measure from.
#   x2, y2: coordinates of the candidate target points.
# Both sets are passed to distm() as (x, y) pairs; callers in this project
# pass longitude as x and latitude as y.
# Returns one value per (x1, y1) point: the row-wise minimum of the distm()
# distance matrix.
getmindist <- function(x1, y1, x2, y2) {
  origins <- data.frame(x1, y1)
  targets <- data.frame(x2, y2)
  pairwise <- distm(origins, targets)
  apply(pairwise, 1, min, simplify = TRUE)
}
# Query OpenStreetMap for features tagged station = subway within the
# Budapest bounding box and keep the point geometries as a data frame.
metro <- getbb("Budapest") |>
  opq() |>
  add_osm_feature("station", "subway") |>
  osmdata_sf()
metro <- data.frame(metro$osm_points)
# Each geometry unlists to a coordinate pair: row 1 is used as lon, row 2 as lat.
metro$lat <- sapply(metro$geometry, unlist)[2, ]
metro$lon <- sapply(metro$geometry, unlist)[1, ]

# Same for train stations (keys "railway"/"train" with values "station"/"yes").
vonat <- getbb("Budapest") |>
  opq() |>
  add_osm_feature(c("railway", "train"), c("station", "yes")) |>
  osmdata_sf()
vonat <- vonat$osm_points
vonat$lat <- sapply(vonat$geometry, unlist)[2, ]
vonat$lon <- sapply(vonat$geometry, unlist)[1, ]

# Distance from every listing to the nearest metro and train station, and
# the smaller of the two as distance to the nearest station of either kind.
df$metrotav <- getmindist(df$lon, df$lat, metro$lon, metro$lat)
df$vonattav <- getmindist(df$lon, df$lat, vonat$lon, vonat$lat)
df$tomkoz <- pmin(df$vonattav, df$metrotav)

16
tdk.Rproj Normal file
View File

@ -0,0 +1,16 @@
Version: 1.0
RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default
EnableCodeIndexing: Yes
UseSpacesForTab: No
NumSpacesForTab: 2
Encoding: UTF-8
RnwWeave: Sweave
LaTeX: pdfLaTeX
AutoAppendNewline: Yes
StripTrailingWhitespace: Yes

25
terkepek.R Normal file
View File

@ -0,0 +1,25 @@
# Maps of the distance variables. Expects `df`, `metro` and `vonat` to exist
# (created by the earlier scripts).
library(leaflet)

# Colour bins: 0, the 2/5/10/25/35/50/75/90 % quantiles of the train-station
# distance, and Inf.
bins <- c(
  0,
  quantile(df$vonattav, probs = c(0.02, 0.05, 0.1, 0.25, 0.35, 0.5, 0.75, 0.9)),
  Inf
)
# NOTE(review): this palette (bins taken from vonattav) is reused for all
# three maps below -- confirm that a shared colour scale is intended.
pal <- colorBin("YlOrRd", domain = df$vonattav, bins = bins)

# Map 1: distance to the nearest train station. The popup shows the train
# distance (it previously showed metrotav, copied from the metro map).
leaflet(df) |>
  addTiles() |>
  addCircles(
    ~lon, ~lat,
    popup = paste0(df$cim, "<br>", round(df$vonattav, 2), " m"),
    radius = 80, fillColor = ~pal(vonattav), fillOpacity = 0.8, stroke = F
  ) |>
  addLegend(
    pal = pal, values = df$vonattav, title = "Távolság a vonattól",
    position = "bottomright"
  ) |>
  addMarkers(vonat$lon, vonat$lat, popup = vonat$name)

# Map 2: distance to the nearest metro station.
leaflet(df) |>
  addTiles() |>
  addCircles(
    ~lon, ~lat,
    popup = paste0(df$cim, "<br>", round(df$metrotav, 2), " m"),
    radius = 80, fillColor = ~pal(metrotav), fillOpacity = 0.8, stroke = F
  ) |>
  addLegend(
    pal = pal, values = df$metrotav, title = "Távolság a metrótól",
    position = "bottomright"
  ) |>
  addMarkers(metro$lon, metro$lat, popup = metro$name)

# Map 3: distance to the nearest station of either kind.
leaflet(df) |>
  addTiles() |>
  addCircles(
    ~lon, ~lat,
    popup = paste0(df$cim, "<br>", round(df$tomkoz, 2), " m"),
    radius = 100, fillColor = ~pal(tomkoz), fillOpacity = 0.8, stroke = F
  ) |>
  addLegend(
    pal = pal, values = df$tomkoz,
    title = "Távolság a legközelebbi <br> metrótól vagy vonattól",
    position = "bottomright"
  ) |>
  addMarkers(metro$lon, metro$lat, popup = metro$name) |>
  addMarkers(vonat$lon, vonat$lat, popup = vonat$name)

20
tisztitas.R Normal file
View File

@ -0,0 +1,20 @@
source("funcs.R")
# Floor area: the part of Alapterület before the first space, as a number.
df$ater <- as.numeric(
  str_split(df$Alapterület, " ", n = 2, simplify = TRUE)[, 1]
)
# Monthly price converted to a number by arconv().
df$ar <- sapply(df$Árhavonta, arconv)
# District: first token of the address with "." removed, read as a Roman
# numeral and stored as a factor.
df$ker <- as.factor(as.numeric(as.roman(
  str_remove(str_split(df$cim, " ", simplify = TRUE)[, 1], "\\.")
)))

# This is slow!! One get_coords() call per row.
# simplify = FALSE keeps the result a list even when every lookup succeeds
# (plain sapply() would then return a matrix).
df$point <- sapply(df$cim, get_coords, simplify = FALSE)
# Keep only rows for which the lookup returned coordinates.
df <- df[!is.na(df$point), ]
df$lat <- as.double(unlist(map(df$point, 1)))
df$lon <- as.double(unlist(map(df$point, 2)))

# Room counts. When Szobák mentions "fél" (half room), the first token is the
# number of full rooms and the third token the number of half rooms;
# otherwise the whole value is the number of full rooms.
# The tokens are extracted with vapply() so that rows without a first/third
# token yield NA and the vectors stay aligned with the rows of df
# (unlist(map(..., 3)) dropped those rows and ifelse() recycled the rest).
df$szoba <- ifelse(
  grepl("fél", df$Szobák),
  as.numeric(vapply(str_split(df$Szobák, " "), function(tok) tok[1], character(1))),
  as.numeric(df$Szobák)
)
df$felszoba <- ifelse(
  grepl("fél", df$Szobák),
  as.numeric(vapply(str_split(df$Szobák, " "), function(tok) tok[3], character(1))),
  0
)
# unlist(map(str_split(df$Szobák," "),3))
# df[felszoba==3,]$Szobák
# summary(grepl("fél",df$Szobák))

# NOTE(review): this writes `df3`, which is not created in this script (it is
# defined in model.R) -- confirm whether `df` was meant here.
write.csv(df3, file = "adat_pozicioval.csv")

4062
tomkoz.html Normal file

File diff suppressed because one or more lines are too long

4062
vonat.html Normal file

File diff suppressed because one or more lines are too long