From 31403c2e5de0ee8f56a9b61d11d10fcf2187f92b Mon Sep 17 00:00:00 2001 From: Johannes Rothe Date: Fri, 14 Jan 2022 22:43:23 +0100 Subject: [PATCH] Scrape the days and availability --- find_targets.py | 47 ------------------------------------ scrape.py | 63 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 47 deletions(-) delete mode 100644 find_targets.py create mode 100644 scrape.py diff --git a/find_targets.py b/find_targets.py deleted file mode 100644 index b2b58c9..0000000 --- a/find_targets.py +++ /dev/null @@ -1,47 +0,0 @@ -import asyncio -import pickle -from httpx import AsyncClient -from typing import Tuple -from bs4 import BeautifulSoup - -MAIN_URL = ( - "https://www.spiekeroog-vermieter.de/suche/monatskalenderSite.htm?wohnids=" -) -DATA_URL = ( - "https://www.spiekeroog-vermieter.de/suche/monatskalender.htm?wohnids=" -) -FROM = 0 -TO = 2000 - - -async def request_async(index: int, client: AsyncClient) -> Tuple: - response_data = await client.get(DATA_URL + str(index), timeout=20.0) - if "Fehler aufgetreten" not in response_data.text: - response_title = await client.get(MAIN_URL + str(index), timeout=20.0) - soup = BeautifulSoup(response_title.text, "lxml") - # h1 contains the house name, h2 the apartment - apartment = soup.body.header.h1.get_text() - unit = soup.body.header.h2.get_text().replace(u'\xa0',u' ') - name = f"{apartment} - {unit}" - return index, name - else: - return 0, "" - - -async def get_valid_ids(): - client = AsyncClient() - results = dict( - await asyncio.gather( - *[request_async(i, client) for i in range(FROM, TO)] - ) - ) - valid = dict(filter(lambda item: item[0] != 0, results.items())) - print(f"Valid ids: {valid}") - await client.aclose() - with open("valid_ids", "wb") as file: - pickle.dump(valid, file) - - -if __name__ == "__main__": - # print(pickle.load(open("valid_ids", "rb"))) - asyncio.run(get_valid_ids()) diff --git a/scrape.py b/scrape.py new file mode 100644 index 0000000..1eae8e5 --- 
import asyncio
import pickle
from datetime import datetime
from typing import Dict, List, Tuple

from bs4 import BeautifulSoup
from httpx import AsyncClient

MAIN_URL = "https://www.spiekeroog-vermieter.de/suche/monatskalenderSite.htm?wohnids="
DATA_URL = "https://www.spiekeroog-vermieter.de/suche/monatskalender.htm?wohnids="
FROM = 100
TO = 120
# Calendar-cell CSS class -> occupancy status:
# 0 = free, 1 = booked, 2 = changeover day (half free / half booked).
STATUS_MAPPING = {"DayF": 0, "DayB": 1, "DayFB": 2, "DayBF": 2}

# German month names as they appear in the site's calendar headers.
# Mapping them directly avoids locale.setlocale("de_DE"), which mutates
# process-global state and raises locale.Error when that locale is not
# installed on the host.
GERMAN_MONTHS = {
    "Januar": 1,
    "Februar": 2,
    "März": 3,
    "April": 4,
    "Mai": 5,
    "Juni": 6,
    "Juli": 7,
    "August": 8,
    "September": 9,
    "Oktober": 10,
    "November": 11,
    "Dezember": 12,
}


def convert_to_datestring(day: str, month: str, year: str) -> str:
    """Convert a day number, German month name and year to 'YYYY-MM-DD'.

    Raises KeyError for an unknown month name and ValueError for an
    invalid day/year combination.
    """
    date = datetime(int(year), GERMAN_MONTHS[month], int(day))
    return date.strftime("%Y-%m-%d")


async def request_data(
    index: int, client: AsyncClient
) -> Tuple[int, str, str, List[Dict[str, int]]]:
    """Fetch and parse the availability calendar for one rental unit.

    Returns (index, apartment, unit, days) on success, where ``days`` is a
    list of one-entry ``{date: status}`` dicts. Returns ``(0, "", "", [])``
    when the site reports an error for this id, so callers can filter on
    the first element.
    """
    response_data = await client.get(DATA_URL + str(index), timeout=20.0)
    if "Fehler aufgetreten" in response_data.text:
        # Invalid id: keep the tuple shape consistent with the success path.
        return 0, "", "", []

    response_title = await client.get(MAIN_URL + str(index), timeout=20.0)
    title_soup = BeautifulSoup(response_title.text, "lxml")
    # h1 holds the house name, h2 the apartment/unit label.
    apartment = title_soup.body.header.h1.get_text()
    unit = (
        title_soup.body.header.h2.get_text()
        .replace("\xa0", " ")
        .replace("Wohneinheit: ", "")
    )

    data_soup = BeautifulSoup(response_data.text, "lxml")
    valid_element = data_soup.find_all("td", attrs={"data-daynum": True})
    days = []
    for elm in valid_element:
        day = elm.get_text(strip=True)
        # The month/year header ("<Month> <Year>") lives in the enclosing
        # table's thead, three levels above the day cell.
        date_raw = elm.parent.parent.parent.thead.tr.td.get_text(strip=True)
        month, year = date_raw.split(" ")[:2]
        status = STATUS_MAPPING[elm["class"][0]]
        date = convert_to_datestring(day, month, year)
        days.append({date: status})
    return index, apartment, unit, days


async def extract_results() -> None:
    """Scrape all ids in [FROM, TO), drop invalid units, pickle the rest."""
    client = AsyncClient()
    results = await asyncio.gather(*[request_data(i, client) for i in range(FROM, TO)])
    # Filter the invalid units (request_data signals them with index 0).
    valid = list(filter(lambda item: item[0] != 0, results))
    await client.aclose()
    with open("valid_ids", "wb") as file:
        pickle.dump(valid, file)


if __name__ == "__main__":
    asyncio.run(extract_results())