2018-10-09 19:57:47 +00:00
|
|
|
import requests
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
from collections import namedtuple
|
|
|
|
import re
|
|
|
|
import json
|
|
|
|
import os.path
|
|
|
|
import mail_handler
|
|
|
|
import argparse
|
|
|
|
|
|
|
|
# Command line interface. The single flag --populate_lists is a boolean
# switch; its help text describes it as filling the json files without
# sending emails.
parser = argparse.ArgumentParser(description="Scraper for apartments")
parser.add_argument(
    '--populate_lists',
    help='populate all json files without sending emails',
    action='store_true',
)
args = parser.parse_args()
|
|
|
|
|
2018-10-11 22:59:11 +00:00
|
|
|
# Record type for one Hestia listing: nine positional fields, the first of
# which is the link.
hestia_apartment_type = namedtuple(
    "Apartment",
    [
        "link",
        "location",
        "date",
        "size",
        "shareable",
        "rooms",
        "price",
        "deposit",
        "prerent",
    ],
)
|
2018-10-09 19:57:47 +00:00
|
|
|
|
2018-10-11 22:59:11 +00:00
|
|
|
# Very long and hideous string: the search URL queried for Taekker listings.
# It is split over adjacent literals purely for readability; the resulting
# value is one single string.
taekkerrealestate = (
    "https://access.myestate.dk/search/json/search/808/"
    "key:a8ecbabae151abacba7dbde04f761c37/images:true/results:this.caseCallback/"
    "?callback=caseCallback&start=0&limit=5000&description_length=300"
    "&exclude_sold=1&salg=1&leje=1&investering=1&sizes=490"
    "&include_raw_dates=1&sort=created_on&sort_order=desc&_=1539293944236"
)

# Upper bound compared against each listing's 'leje_pr_mdr' value.
taekker_max_price = 6800

# Accepted values for each listing's 'postnr' field.
taekker_location = ['8000', '8200']

# JSON file in which already reported Taekker listings are stored.
taekker_file = "seen_apartments_taekker.json"
|
2018-10-09 19:57:47 +00:00
|
|
|
|
|
|
|
|
2018-10-11 22:59:11 +00:00
|
|
|
def do_old_file_things(file_name, output_type):
    """Load the previously seen apartments stored in a JSON file.

    The file is expected to hold a JSON array in which every element is
    itself an array of positional values for ``output_type``.

    Args:
        file_name: Path of the JSON file. If it does not exist yet, an empty
            file is created.
        output_type: Callable (typically a namedtuple class) invoked as
            ``output_type(*entry)`` for every stored entry.

    Returns:
        A list of ``output_type`` instances; empty when the file is missing,
        empty or contains only whitespace.

    Raises:
        json.JSONDecodeError: If the file holds text that is not valid JSON.
        TypeError: If a stored entry does not fit ``output_type``.
    """
    # Create the file on first use. The context manager closes the handle
    # immediately instead of leaving it open until garbage collection.
    if not os.path.isfile(file_name):
        with open(file_name, "w", encoding="utf8"):
            pass

    with open(file_name, encoding="utf8") as json_file:
        text = json_file.read()

    # Nothing stored yet (json.loads would reject an empty document).
    if not text.strip():
        return []

    return [output_type(*apartment_) for apartment_ in json.loads(text)]
|
|
|
|
|
|
|
|
|
|
|
|
def do_taeker_things():
    """Check the Taekker listing feed and report apartments not seen before.

    Loads the already seen apartments from ``taekker_file``, downloads the
    feed at ``taekkerrealestate``, keeps the listings whose 'postnr' is in
    ``taekker_location`` and whose 'leje_pr_mdr' does not exceed
    ``taekker_max_price``, and passes the link of every listing with an
    unseen id to ``mail_handler.handle`` (skipped when ``--populate_lists``
    is set). The updated list is written back to ``taekker_file``.

    Raises:
        requests.exceptions.RequestException: If the download fails or
            times out.
        json.JSONDecodeError: If the feed cannot be parsed.
    """
    taeker_type = namedtuple("Taeker_type", "link id address ")

    previous_seen = do_old_file_things(taekker_file, taeker_type)

    # A timeout keeps an unresponsive server from blocking the run forever.
    response = requests.get(taekkerrealestate, timeout=30)
    # The first 18 characters and the last character are cut off before
    # parsing. NOTE(review): presumably the JSONP wrapper
    # "this.caseCallback(" ... ")" -- confirm against a live response.
    json_stuff = json.loads(response.text[18:][:-1])

    def find_at_location_at_price(locations, json_stuff):
        """Return the listings in *locations* priced within the limit."""
        found_apartments = []
        concat_string = "https://taekkerrealestate.dk"
        for apartment in json_stuff:
            try:
                if apartment['udlejes_fra'] == '01-01-1970':
                    continue
                if apartment['postnr'] not in locations:
                    continue
                # Dots are removed before the number is parsed
                # (presumably thousands separators).
                price = int(apartment['leje_pr_mdr'].replace(".", ""))
                if price > taekker_max_price:
                    continue
                found = taeker_type(
                    "{}{}".format(concat_string, apartment['page_url']),
                    apartment['id'],
                    apartment['adresse'],
                )
            except (KeyError, ValueError):
                # Incomplete or malformed listing: skip it instead of
                # aborting the whole run.
                continue
            found_apartments.append(found)
        return found_apartments

    all_apartments = find_at_location_at_price(taekker_location, json_stuff)

    # The set is extended while iterating so that a listing occurring twice
    # in one response is reported (and stored) only once.
    seen_ids = {tmp.id for tmp in previous_seen}
    for apartment in all_apartments:
        if apartment.id in seen_ids:
            print("I've already seen this")
            continue
        print("I've found a new apartment!")
        if not args.populate_lists:
            mail_handler.handle(apartment.link)
        seen_ids.add(apartment.id)
        previous_seen.append(apartment)

    with open(taekker_file, 'w', encoding="utf8") as outfile:
        json.dump(previous_seen, outfile)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def do_find_bolig_things():
    """Check the findbolig.nu result list and report unseen apartments.

    Loads the already seen apartments from
    'seen_apartments_find_bolig.json', downloads the search result page,
    builds a link and an id from the first cell of every result row, and
    passes the link of every apartment with an unseen id to
    ``mail_handler.handle`` (skipped when ``--populate_lists`` is set). The
    updated list is written back to the same file.

    Raises:
        requests.exceptions.RequestException: If the download fails or
            times out.
    """
    find_bolig_type = namedtuple("Apartment_find_bolig", "link id")

    seen_apartments_file_find_bolig = 'seen_apartments_find_bolig.json'

    previous_seen = do_old_file_things(seen_apartments_file_find_bolig, find_bolig_type)

    # A timeout keeps an unresponsive server from blocking the run forever.
    r = requests.get("https://www.findbolig.nu/ledigeboliger/liste.aspx?where=Aarhus%208000&rentmax=7000&showrented=1&showyouth=1&showlimitedperiod=1&showunlimitedperiod=1&showOpenDay=0&focus=ctl00_placeholdersidebar_0_txt_where", timeout=30)

    soup = BeautifulSoup(r.text, "html5lib")
    # find() returns None when the element is absent; treat that as an
    # empty result list so the scrapers that run afterwards still execute.
    results = soup.find(id="GridView_Results")
    table_body = results.find("tbody") if results is not None else None
    if table_body is None:
        print("Could not find the findbolig.nu result table")
        rows = []
    else:
        rows = table_body.find_all('tr')

    all_apartments = []
    concatable_string = "https://www.findbolig.nu/Findbolig-nu/Find%20bolig/Ledige%20boliger/Boligpraesentation"
    # The first row is skipped. NOTE(review): presumably the header row.
    for row in rows[1:]:
        cols = row.find_all('td')
        anchor = cols[0].find('a') if cols else None
        match = re.search('(aid.+)', anchor.get('href', '')) if anchor is not None else None
        # Rows without a usable "aid=..." link cannot be identified; skip.
        if match is None or "=" not in match.group(0):
            continue
        aid = match.group(0)
        # Hacky :( -- the id is the text between the first "=" and the
        # following "&" (or "=") of the matched query string.
        apartment_id = aid.split("=")[1].split("&")[0]
        link = concatable_string + "/Boligen.aspx?" + aid

        all_apartments.append(find_bolig_type(link, apartment_id))

    # The set is extended while iterating so that an apartment occurring
    # twice on one page is reported (and stored) only once.
    seen_ids = {tmp.id for tmp in previous_seen}
    for apartment in all_apartments:
        if apartment.id in seen_ids:
            print("I've already seen this")
            continue
        print("I've found a new apartment!")
        if not args.populate_lists:
            mail_handler.handle(apartment.link)
        seen_ids.add(apartment.id)
        previous_seen.append(apartment)

    with open(seen_apartments_file_find_bolig, 'w', encoding="utf8") as outfile:
        json.dump(previous_seen, outfile)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def do_hestia_things():
    """Check the Hestia listing table and report unseen apartments.

    Loads the already seen apartments from 'seen_apartments_hestia.json',
    downloads the listing page, turns every table row into a
    ``hestia_apartment_type`` (link taken from the row's ``onclick``
    attribute, remaining fields from the cell texts), and passes the link of
    every apartment with an unseen ``location`` to ``mail_handler.handle``
    (skipped when ``--populate_lists`` is set). The updated list is written
    back to the same file.

    Raises:
        requests.exceptions.RequestException: If the download fails or
            times out.
    """
    seen_apartments_file_hestia = 'seen_apartments_hestia.json'

    previous_seen = do_old_file_things(seen_apartments_file_hestia, hestia_apartment_type)

    # A timeout keeps an unresponsive server from blocking the run forever.
    r = requests.get("https://www.hestia.as/ledige-lejligheder/?area=266&max=7200", timeout=30)
    soup = BeautifulSoup(r.text, "html5lib")
    # find() returns None when the element is absent; treat that as an
    # empty listing instead of crashing.
    table = soup.find(id="sortTable")
    table_body = table.find("tbody") if table is not None else None
    if table_body is None:
        print("Could not find the Hestia listing table")
        rows = []
    else:
        rows = table_body.find_all('tr')

    # Every field after the link is filled from one table cell.
    expected_cells = len(hestia_apartment_type._fields) - 1

    all_apartments = []
    for row in rows:
        # Rows without an onclick URL carry no link; skip them.
        match = re.search("(https.+|http.+)", row.get("onclick") or "")
        if match is None:
            continue
        # Keep the text before the first single quote. partition() leaves
        # the link intact when there is no quote, whereas slicing with
        # find() == -1 would silently drop the last character.
        link = match.group(0).partition("'")[0]
        text = [col.get_text() for col in row.find_all('td')]
        if len(text) != expected_cells:
            print("Skipping Hestia row with unexpected number of columns")
            continue
        all_apartments.append(hestia_apartment_type(link, *text))

    # The set is extended while iterating so that an apartment occurring
    # twice on one page is reported (and stored) only once.
    seen_locations = {tmp.location for tmp in previous_seen}
    for apartment in all_apartments:
        if apartment.location in seen_locations:
            print("I've already seen this")
            continue
        print("I've found a new apartment!")
        if not args.populate_lists:
            mail_handler.handle(apartment.link)
        seen_locations.add(apartment.location)
        previous_seen.append(apartment)

    with open(seen_apartments_file_hestia, 'w', encoding="utf8") as outfile:
        json.dump(previous_seen, outfile)
|
|
|
|
|
|
|
|
|
|
|
|
# Run each scraper once, one after another.
for scraper in (do_find_bolig_things, do_taeker_things, do_hestia_things):
    scraper()
|
2018-10-09 19:57:47 +00:00
|
|
|
|
|
|
|
|