"""Scraper for apartment listings on findbolig.nu and hestia.as.

For every listing not seen on a previous run, prints a notice and (unless
--populate_lists is given) sends a mail via mail_handler. The set of
already-seen apartments is persisted between runs in per-site JSON files.
"""

import argparse
import json
import os.path
import re
from collections import namedtuple

import requests
from bs4 import BeautifulSoup

import mail_handler

parser = argparse.ArgumentParser(description="Scraper for apartments")
parser.add_argument('--populate_lists', action='store_true',
                    help='populate all json files without sending emails')
args = parser.parse_args()

# Full listing record scraped from the hestia.as results table.
apartment_type = namedtuple(
    "Apartment", "link location date size shareable rooms price deposit prerent")


def _load_seen(path):
    """Return the JSON list stored at *path*, creating the file if missing.

    An empty (or just-created) file yields an empty list instead of a
    json.JSONDecodeError.
    """
    if not os.path.isfile(path):
        # Create an empty file so the read below and the final dump both
        # succeed; use `with` so the handle is closed immediately.
        with open(path, "w+", encoding="utf8"):
            pass
    with open(path, encoding="utf8") as json_file:
        text = json_file.read()
    return json.loads(text) if text else []


def do_find_bolig_things():
    """Scrape findbolig.nu and notify about each previously unseen listing."""
    find_bolig_type = namedtuple("Apartment_find_bolig", "link id")
    seen_apartments_file_find_bolig = 'seen_apartments_find_bolig.json'

    previous_seen = [find_bolig_type(*apartment_)
                     for apartment_ in _load_seen(seen_apartments_file_find_bolig)]

    r = requests.get(
        "https://www.findbolig.nu/ledigeboliger/liste.aspx?where=Aarhus%208000"
        "&rentmax=7000&showrented=1&showyouth=1&showlimitedperiod=1"
        "&showunlimitedperiod=1&showOpenDay=0"
        "&focus=ctl00_placeholdersidebar_0_txt_where")
    soup = BeautifulSoup(r.text, "html5lib")
    table_body = soup.find(id="GridView_Results").find("tbody")
    rows = table_body.find_all('tr')

    concatable_string = ("https://www.findbolig.nu/Findbolig-nu/Find%20bolig/"
                         "Ledige%20boliger/Boligpraesentation")
    all_apartments = []
    for row in rows[1:]:  # rows[0] is the table header
        cols = row.find_all('td')
        # Hacky :( -- pull the "aid=..." query fragment out of the row link.
        aid = re.search('(aid.+)', cols[0].find('a')['href']).group(0)
        apartment_id = aid.split("=")[1].split("&")[0]
        # BUG FIX: "+ aid" previously sat on its own line as a no-op
        # expression statement, so the id was silently dropped from the link.
        link = concatable_string + "/Boligen.aspx?" + aid
        all_apartments.append(find_bolig_type(link, apartment_id))

    # Set membership is O(1); also updated inside the loop so a listing that
    # appears twice in one scrape is only recorded (and mailed) once.
    already_seen_ids = {tmp.id for tmp in previous_seen}
    for apartment in all_apartments:
        if apartment.id not in already_seen_ids:
            print("I've found a new apartment!")
            if not args.populate_lists:
                mail_handler.handle(apartment.link)
            previous_seen.append(apartment)
            already_seen_ids.add(apartment.id)
        else:
            print("I've already seen this")

    with open(seen_apartments_file_find_bolig, 'w', encoding="utf8") as outfile:
        # namedtuples serialize as JSON arrays, matching what _load_seen reads.
        json.dump(previous_seen, outfile)


def do_hestia_things():
    """Scrape hestia.as and notify about each previously unseen listing."""
    seen_apartments_file_hestia = 'seen_apartments_hestia.json'

    previous_seen = [apartment_type(*apartment_)
                     for apartment_ in _load_seen(seen_apartments_file_hestia)]

    r = requests.get("https://www.hestia.as/ledige-lejligheder/?area=266&max=7200")
    soup = BeautifulSoup(r.text, "html5lib")
    table_body = soup.find(id="sortTable").find("tbody")

    all_apartments = []
    for row in table_body.find_all('tr'):
        # The row's onclick handler embeds the detail-page URL.
        link = re.search("(https.+|http.+)", row.get("onclick")).group(0)
        text = [col.get_text() for col in row.find_all('td')]
        # Trim the trailing quote (and any JS) off the matched URL.
        all_apartments.append(apartment_type(link[:link.find("\'")], *text))

    # Listings are deduplicated by location on this site.
    already_seen_locations = {tmp.location for tmp in previous_seen}
    for apartment in all_apartments:
        if apartment.location not in already_seen_locations:
            print("I've found a new apartment!")
            if not args.populate_lists:
                mail_handler.handle(apartment.link)
            previous_seen.append(apartment)
            already_seen_locations.add(apartment.location)
        else:
            print("I've already seen this")

    with open(seen_apartments_file_hestia, 'w', encoding="utf8") as outfile:
        json.dump(previous_seen, outfile)


# NOTE(review): do_hestia_things() is defined but never invoked here, matching
# the original file; only the findbolig scrape runs.
do_find_bolig_things()