apartment_scrapers/apartment_looker.py

import requests
from bs4 import BeautifulSoup
from collections import namedtuple
import re
import json
import os.path
import mail_handler
import argparse

# Command-line interface. The single flag --populate_lists is read by the
# scraper functions below to decide whether mail_handler.handle() is called.
parser = argparse.ArgumentParser(description="Scraper for apartments")
parser.add_argument('--populate_lists', action='store_true',
                    help='populate all json files without sending emails')

# NOTE: the arguments are parsed at module level, so merely importing this
# module already reads the process's command line.
args = parser.parse_args()

# Record for one Hestia listing: the link followed by the text of the eight
# table cells of the listing's row (see do_hestia_things()).
hestia_apartment_type = namedtuple("Apartment", "link location date size shareable rooms price deposit prerent")

# Search endpoint fetched by do_taeker_things(). The query asks for a JSONP
# callback (callback=caseCallback), which is why the response has to be
# unwrapped before it can be parsed as JSON.
# NOTE(review): the URL embeds a "key:" value - confirm it is meant to be
# committed to the repository.
taekkerrealestate = "https://access.myestate.dk/search/json/search/808/key:a8ecbabae151abacba7dbde04f761c37/images:true/results:this.caseCallback/?callback=caseCallback&start=0&limit=5000&description_length=300&exclude_sold=1&salg=1&leje=1&investering=1&sizes=490&include_raw_dates=1&sort=created_on&sort_order=desc&_=1539293944236"
# Upper bound (inclusive) for a listing's 'leje_pr_mdr' value.
taekker_max_price = 6800
# Accepted values of a listing's 'postnr' field.
taekker_location = ['8000', '8200']
# JSON file in which the Taekker listings seen so far are stored.
taekker_file = "seen_apartments_taekker.json"


def do_old_file_things(file_name, output_type):
    """Load the apartments recorded by earlier runs from a JSON file.

    The file is expected to hold a JSON array in which every element is a
    sequence of positional values for ``output_type`` (which is what
    ``json.dump`` writes for a list of namedtuples).

    Args:
        file_name: Path of the JSON file. It is created empty when it does
            not exist yet, so a first run starts with nothing seen.
        output_type: Callable invoked as ``output_type(*record)`` for every
            stored record; the callers in this file pass a namedtuple class.

    Returns:
        A list with one ``output_type`` result per stored record. The list is
        empty when the file is missing, empty or contains only whitespace.

    Raises:
        json.JSONDecodeError: If the file holds text that is not valid JSON.
    """
    if not os.path.isfile(file_name):
        # Create the file inside a context manager so the handle is closed
        # immediately instead of being left open until garbage collection.
        with open(file_name, "w", encoding="utf8"):
            pass
        return []

    with open(file_name, encoding="utf8") as json_file:
        text = json_file.read()

    # A blank file (including one that only holds a newline) means that
    # nothing has been recorded yet; json.loads() would reject it.
    if not text.strip():
        return []

    return [output_type(*record) for record in json.loads(text)]


def do_taeker_things():
    """Check the Taekker endpoint for new rentals and remember them.

    Steps:
      1. Load the listings stored in ``taekker_file``.
      2. Fetch the current listings from ``taekkerrealestate``.
      3. Keep the listings whose 'postnr' is in ``taekker_location`` and whose
         'leje_pr_mdr' is at most ``taekker_max_price``.
      4. For every kept listing whose id has not been seen before, pass its
         link to ``mail_handler.handle`` (skipped with ``--populate_lists``)
         and record it.
      5. Write the updated list back to ``taekker_file``.

    Raises:
        requests.RequestException: If the request fails or times out.
        ValueError: If the response is not a callback-wrapped JSON document
            (``json.JSONDecodeError`` is a ``ValueError``).
    """
    taeker_type = namedtuple("Taeker_type", "link id address ")

    previous_seen = do_old_file_things(taekker_file, taeker_type)

    # Without a timeout a stalled server blocks the script forever. The
    # requests timeout limits connecting and each wait for data, not the
    # duration of the whole download, so a large response is still fine.
    response = requests.get(taekkerrealestate, timeout=30)

    # The URL asks for a JSONP callback, so the JSON arrives wrapped in a
    # function call. Cut at the outermost parentheses instead of relying on
    # the wrapper being exactly 18 leading characters and 1 trailing one.
    body = response.text
    start = body.find("(")
    end = body.rfind(")")
    if start == -1 or end <= start:
        raise ValueError("Unexpected response from the taekker endpoint")
    json_stuff = json.loads(body[start + 1:end])

    def find_at_location_at_price(locations, json_stuff):
        """Return the listings located in `locations` that are cheap enough."""
        found_apartments = []
        concat_string = "https://taekkerrealestate.dk"
        for apartment in json_stuff:
            try:
                available_from = apartment['udlejes_fra']
                postal_code = apartment['postnr']
                rent_text = apartment['leje_pr_mdr']
                page_url = apartment['page_url']
                apartment_id = apartment['id']
                address = apartment['adresse']
            except KeyError:
                # Listings lacking any field that is needed are skipped; the
                # original only did so for the first two fields and crashed
                # on the others.
                continue

            # '01-01-1970' is skipped as before (presumably an unset date -
            # TODO confirm against the API).
            if available_from == '01-01-1970' or postal_code not in locations:
                continue

            try:
                # Dots are removed before conversion (the rent appears to use
                # '.' as thousands separator - TODO confirm).
                rent = int(rent_text.replace(".", ""))
            except (AttributeError, ValueError):
                # One malformed rent must not abort the whole run.
                print("Skipping listing with unparsable rent: {!r}".format(rent_text))
                continue

            if rent <= taekker_max_price:
                url = "{}{}".format(concat_string, page_url)
                found_apartments.append(taeker_type(url, apartment_id, address))
        return found_apartments

    all_apartments = find_at_location_at_price(taekker_location, json_stuff)

    # A set gives O(1) membership tests. It is updated while looping so that
    # an id occurring twice in one response is only reported and stored once.
    seen_ids = {seen.id for seen in previous_seen}
    for apartment in all_apartments:
        if apartment.id in seen_ids:
            print("I've already seen this")
            continue
        print("I've found a new apartment!")
        if not args.populate_lists:
            mail_handler.handle(apartment.link)
        previous_seen.append(apartment)
        seen_ids.add(apartment.id)

    with open(taekker_file, 'w', encoding="utf8") as outfile:
        json.dump(previous_seen, outfile)


def do_find_bolig_things():
    """Check findbolig.nu for new rentals and remember them.

    Loads the listings stored in 'seen_apartments_find_bolig.json', downloads
    the result list, and builds a link and an id for every table row after the
    first one from the ``aid...`` part of the row's first anchor. Every
    listing whose id has not been seen before has its link passed to
    ``mail_handler.handle`` (skipped with ``--populate_lists``) and is
    recorded. The updated list is then written back to the JSON file.

    When the result table cannot be found a message is printed and the
    function returns without touching the stored list, so the other scrapers
    can still run.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    find_bolig_type = namedtuple("Apartment_find_bolig", "link id")

    seen_apartments_file_find_bolig = 'seen_apartments_find_bolig.json'

    previous_seen = do_old_file_things(seen_apartments_file_find_bolig, find_bolig_type)

    # Without a timeout a stalled server blocks the script forever.
    r = requests.get("https://www.findbolig.nu/ledigeboliger/liste.aspx?where=Aarhus%208000&rentmax=7000&showrented=1&showyouth=1&showlimitedperiod=1&showunlimitedperiod=1&showOpenDay=0&focus=ctl00_placeholdersidebar_0_txt_where", timeout=30)

    soup = BeautifulSoup(r.text, "html5lib")
    results_table = soup.find(id="GridView_Results")
    table_body = results_table.find("tbody") if results_table is not None else None
    if table_body is None:
        # find() returns None for a missing element; the original crashed
        # here with an AttributeError.
        print("Could not find the findbolig.nu results table")
        return

    all_apartments = []
    concatable_string = "https://www.findbolig.nu/Findbolig-nu/Find%20bolig/Ledige%20boliger/Boligpraesentation"
    # The first row is skipped, as before (presumably the header row).
    for row in table_body.find_all('tr')[1:]:
        cols = row.find_all('td')
        anchor = cols[0].find('a') if cols else None
        href = anchor.get('href') if anchor is not None else None
        match = re.search('(aid.+)', href) if href else None
        if match is None or "=" not in match.group(0):
            # Rows without a usable listing link cannot be turned into an
            # apartment; skip them instead of crashing.
            continue
        aid = match.group(0)
        # Hacky :( - the id is the text between the first '=' and the
        # following '=' or '&'.
        apartment_id = aid.split("=")[1].split("&")[0]
        link = concatable_string + "/Boligen.aspx?" + aid

        all_apartments.append(find_bolig_type(link, apartment_id))

    # A set gives O(1) membership tests. It is updated while looping so that
    # an id occurring twice in one result page is only reported and stored
    # once.
    seen_ids = {seen.id for seen in previous_seen}
    for apartment in all_apartments:
        if apartment.id in seen_ids:
            print("I've already seen this")
            continue
        print("I've found a new apartment!")
        if not args.populate_lists:
            mail_handler.handle(apartment.link)
        previous_seen.append(apartment)
        seen_ids.add(apartment.id)

    with open(seen_apartments_file_find_bolig, 'w', encoding="utf8") as outfile:
        json.dump(previous_seen, outfile)


def do_hestia_things():
    """Check hestia.as for new rentals and remember them.

    Loads the listings stored in 'seen_apartments_hestia.json', downloads the
    listing table, and builds one ``hestia_apartment_type`` per row from the
    URL in the row's ``onclick`` attribute plus the text of its cells. Every
    listing whose location was not stored by an earlier run has its link
    passed to ``mail_handler.handle`` (skipped with ``--populate_lists``) and
    is recorded. The updated list is then written back to the JSON file.

    When the listing table cannot be found a message is printed and the
    function returns without touching the stored list.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    seen_apartments_file_hestia = 'seen_apartments_hestia.json'

    previous_seen = do_old_file_things(seen_apartments_file_hestia, hestia_apartment_type)

    # Without a timeout a stalled server blocks the script forever.
    r = requests.get("https://www.hestia.as/ledige-lejligheder/?area=266&max=7200", timeout=30)
    soup = BeautifulSoup(r.text, "html5lib")
    table = soup.find(id="sortTable")
    table_body = table.find("tbody") if table is not None else None
    if table_body is None:
        # find() returns None for a missing element; the original crashed
        # here with an AttributeError.
        print("Could not find the hestia.as apartment table")
        return

    # Every field except the leading link is filled from one table cell.
    expected_columns = len(hestia_apartment_type._fields) - 1

    all_apartments = []
    for row in table_body.find_all('tr'):
        # Rows without an onclick URL cannot be linked to; skip them instead
        # of handing None to re.search().
        match = re.search("(https.+|http.+)", row.get("onclick") or "")
        if match is None:
            continue

        text = [col.get_text() for col in row.find_all('td')]
        if len(text) != expected_columns:
            # Building the namedtuple would raise TypeError and abort the run.
            print("Skipping hestia row with {} columns, expected {}".format(
                len(text), expected_columns))
            continue

        # Keep everything before the first quote. Unlike slicing with
        # str.find(), partition() does not chop off the last character when
        # the match contains no quote at all.
        link = match.group(0).partition("\'")[0]
        all_apartments.append(hestia_apartment_type(link, *text))

    # Membership is tested against the locations stored by earlier runs only,
    # so several new listings that share a location are all reported.
    already_seen_locations = {seen.location for seen in previous_seen}
    for apartment in all_apartments:
        if apartment.location in already_seen_locations:
            print("I've already seen this")
            continue
        print("I've found a new apartment!")
        if not args.populate_lists:
            mail_handler.handle(apartment.link)
        previous_seen.append(apartment)

    with open(seen_apartments_file_hestia, 'w', encoding="utf8") as outfile:
        json.dump(previous_seen, outfile)


# Run every scraper once, in a fixed order: findbolig.nu, Taekker, Hestia.
for run_scraper in (do_find_bolig_things, do_taeker_things, do_hestia_things):
    run_scraper()
Initial commit 2018-10-09 19:57:47 +00:00			`import requests`
			`from bs4 import BeautifulSoup`
			`from collections import namedtuple`
			`import re`
			`import json`
			`import os.path`
			`import mail_handler`
			`import argparse`

			`parser = argparse.ArgumentParser(description="Scraper for apartments")`
			`parser.add_argument('--populate_lists', action='store_true',`
			`help='populate all json files without sending emails')`

			`args = parser.parse_args()`

Added taekker real estate 2018-10-11 22:59:11 +00:00			`hestia_apartment_type = namedtuple("Apartment", "link location date size shareable rooms price deposit prerent")`
Initial commit 2018-10-09 19:57:47 +00:00
Added taekker real estate 2018-10-11 22:59:11 +00:00			`# Very long and hideous string`
			`taekkerrealestate = "https://access.myestate.dk/search/json/search/808/key:a8ecbabae151abacba7dbde04f761c37/images:true/results:this.caseCallback/?callback=caseCallback&start=0&limit=5000&description_length=300&exclude_sold=1&salg=1&leje=1&investering=1&sizes=490&include_raw_dates=1&sort=created_on&sort_order=desc&_=1539293944236"`
			`taekker_max_price = 6800`
			`taekker_location = ['8000', '8200']`
			`taekker_file = "seen_apartments_taekker.json"`
Initial commit 2018-10-09 19:57:47 +00:00

Added taekker real estate 2018-10-11 22:59:11 +00:00			`def do_old_file_things(file_name, output_type):`
Initial commit 2018-10-09 19:57:47 +00:00			`# If find_bolig file doesn't exist, create`
Added taekker real estate 2018-10-11 22:59:11 +00:00			`if not os.path.isfile(file_name):`
			`open(file_name, "w+")`
Initial commit 2018-10-09 19:57:47 +00:00
Added taekker real estate 2018-10-11 22:59:11 +00:00			`with open(file_name, encoding="utf8") as json_file:`
Initial commit 2018-10-09 19:57:47 +00:00			`text = json_file.read()`
			`if len(text) == 0:`
			`previously_seen_apartments = []`
			`else:`
			`json_data = json.loads(text)`
			`previously_seen_apartments = json_data`


			`previous_seen = []`

			`for apartment_ in previously_seen_apartments:`
Added taekker real estate 2018-10-11 22:59:11 +00:00			`previous_seen.append(output_type(*apartment_))`

			`return previous_seen`


			`def do_taeker_things():`

			`taeker_type = namedtuple("Taeker_type", "link id address ")`

			`previous_seen = do_old_file_things(taekker_file, taeker_type)`

			`json_stuff = json.loads(requests.get(taekkerrealestate).text[18:][:-1])`

			`def find_at_location_at_price(locations, json_stuff):`
			`found_apartments = []`
			`concat_string = "https://taekkerrealestate.dk"`
			`for apartment in json_stuff:`
			`try:`
			`if apartment['udlejes_fra'] == '01-01-1970':`
			`continue`
			`if not apartment['postnr'] in locations:`
			`continue`
			`except KeyError:`
			`continue`
			`if int(str.join("", apartment['leje_pr_mdr'].split("."))) <= taekker_max_price:`
			`url = "{}{}".format(concat_string, apartment['page_url'])`
			`found_apartments.append(taeker_type(url, apartment['id'], apartment['adresse']))`
			`return found_apartments`

			`all_apartments = find_at_location_at_price(taekker_location, json_stuff)`

			`def already_seen(already_seens, spec_currently_found) -> bool:`
			`return spec_currently_found.id in already_seens`

			`already_seen_locations = [tmp.id for tmp in previous_seen]`
			`for apartment in all_apartments:`
			`if not already_seen(already_seen_locations, apartment):`
			`print("I've found a new apartment!")`
			`if not args.populate_lists:`
			`mail_handler.handle(apartment.link)`
			`previous_seen.append(apartment)`
			`else:`
			`print("I've already seen this")`

			`with open(taekker_file, 'w', encoding="utf8") as outfile:`
			`json.dump(previous_seen, outfile)`



			`def do_find_bolig_things():`
			`find_bolig_type = namedtuple("Apartment_find_bolig", "link id")`

			`seen_apartments_file_find_bolig = 'seen_apartments_find_bolig.json'`

			`previous_seen = do_old_file_things(seen_apartments_file_find_bolig, find_bolig_type)`
Initial commit 2018-10-09 19:57:47 +00:00
			`r = requests.get("https://www.findbolig.nu/ledigeboliger/liste.aspx?where=Aarhus%208000&rentmax=7000&showrented=1&showyouth=1&showlimitedperiod=1&showunlimitedperiod=1&showOpenDay=0&focus=ctl00_placeholdersidebar_0_txt_where")`

			`soup = BeautifulSoup(r.text, "html5lib")`
			`table_body = soup.find(id="GridView_Results").find("tbody")`

			`all_apartments = []`
			`rows = table_body.find_all('tr')`

			`concatable_string = "https://www.findbolig.nu/Findbolig-nu/Find%20bolig/Ledige%20boliger/Boligpraesentation"`
			`for row in rows[1:]:`
			`cols = row.find_all('td')`
			`aid = re.search('(aid.+)', cols[0].find('a')['href']).group(0)`
			`# Hacky :(`
			`id = aid.split("=")[1].split("&")[0]`
			`link = concatable_string + "/Boligen.aspx?" + aid`

			`tmp = find_bolig_type(link, id)`
			`all_apartments.append(tmp)`

			`def already_seen(already_seens, spec_currently_found) -> bool:`
			`return spec_currently_found.id in already_seens`

			`already_seen_locations = [tmp.id for tmp in previous_seen]`
			`for apartment in all_apartments:`
			`if not already_seen(already_seen_locations, apartment):`
			`print("I've found a new apartment!")`
			`if not args.populate_lists:`
			`mail_handler.handle(apartment.link)`
			`previous_seen.append(apartment)`
			`else:`
			`print("I've already seen this")`



			`with open(seen_apartments_file_find_bolig, 'w', encoding="utf8") as outfile:`
			`json.dump(previous_seen, outfile)`



			`def do_hestia_things():`

			`seen_apartments_file_hestia = 'seen_apartments_hestia.json'`

Added taekker real estate 2018-10-11 22:59:11 +00:00			`previous_seen = do_old_file_things(seen_apartments_file_hestia, hestia_apartment_type)`
Initial commit 2018-10-09 19:57:47 +00:00
			`r = requests.get("https://www.hestia.as/ledige-lejligheder/?area=266&max=7200")`
			`soup = BeautifulSoup(r.text, "html5lib")`
			`table_body = soup.find(id="sortTable").find("tbody")`

			`all_apartments = []`

			`rows = table_body.find_all('tr')`
			`for row in rows:`
			`link = re.search("(https.+\|http.+)", row.get("onclick")).group(0)`
			`cols = row.find_all('td')`
			`text = [col.get_text() for col in cols]`
Added taekker real estate 2018-10-11 22:59:11 +00:00			`all_apartments.append(hestia_apartment_type(link[:link.find("\'")], *text))`
Initial commit 2018-10-09 19:57:47 +00:00

			`def already_seen(already_seens, spec_currently_found) -> bool:`
			`return spec_currently_found.location in already_seens`


			`already_seen_locations = [tmp.location for tmp in previous_seen]`
			`for apartment in all_apartments:`
			`if not already_seen(already_seen_locations, apartment):`
			`print("I've found a new apartment!")`
			`if not args.populate_lists:`
			`mail_handler.handle(apartment.link)`
			`previous_seen.append(apartment)`
			`else:`
			`print("I've already seen this")`



			`with open(seen_apartments_file_hestia, 'w', encoding="utf8") as outfile:`
			`json.dump(previous_seen, outfile)`


			`do_find_bolig_things()`
Added taekker real estate 2018-10-11 22:59:11 +00:00			`do_taeker_things()`
			`do_hestia_things()`
Initial commit 2018-10-09 19:57:47 +00:00