From 12f106602a964a8a0f95cb069437c3d019dbcccf Mon Sep 17 00:00:00 2001
From: Johannes Rothe
Date: Fri, 14 Jan 2022 21:25:40 +0100
Subject: [PATCH] Add initial scraping of valid IDs and names

Saves a pickled dictionary mapping each valid apartment ID to its name.
---
 find_targets.py | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100644 find_targets.py

diff --git a/find_targets.py b/find_targets.py
new file mode 100644
index 0000000..b2b58c9
--- /dev/null
+++ b/find_targets.py
@@ -0,0 +1,48 @@
+import asyncio
+import pickle
+from typing import Tuple
+
+from bs4 import BeautifulSoup
+from httpx import AsyncClient
+
+MAIN_URL = (
+    "https://www.spiekeroog-vermieter.de/suche/monatskalenderSite.htm?wohnids="
+)
+DATA_URL = (
+    "https://www.spiekeroog-vermieter.de/suche/monatskalender.htm?wohnids="
+)
+FROM = 0
+TO = 2000
+
+
+async def request_async(index: int, client: AsyncClient) -> Tuple[int, str]:
+    response_data = await client.get(DATA_URL + str(index), timeout=20.0)
+    # Invalid IDs return an error page containing "Fehler aufgetreten"
+    # (German for "an error occurred").
+    if "Fehler aufgetreten" not in response_data.text:
+        response_title = await client.get(MAIN_URL + str(index), timeout=20.0)
+        soup = BeautifulSoup(response_title.text, "lxml")
+        # h1 contains the house name, h2 the apartment name
+        house = soup.body.header.h1.get_text()
+        apartment = soup.body.header.h2.get_text().replace("\xa0", " ")
+        return index, f"{house} - {apartment}"
+    return 0, ""
+
+
+async def get_valid_ids():
+    async with AsyncClient() as client:
+        results = dict(
+            await asyncio.gather(
+                *[request_async(i, client) for i in range(FROM, TO)]
+            )
+        )
+    # All invalid IDs collapse into the single sentinel entry 0; drop it.
+    valid = {index: name for index, name in results.items() if index != 0}
+    print(f"Valid IDs: {valid}")
+    with open("valid_ids", "wb") as file:
+        pickle.dump(valid, file)
+
+
+if __name__ == "__main__":
+    # print(pickle.load(open("valid_ids", "rb")))
+    asyncio.run(get_valid_ids())
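
Not part of the patch itself: a minimal sketch of how a consumer could read
the result back, assuming the pickled object is the dict of ID -> name pairs
that get_valid_ids() writes to the file "valid_ids":

    import pickle

    # Load the mapping of valid apartment IDs to display names.
    with open("valid_ids", "rb") as file:
        valid = pickle.load(file)

    for wohnid, name in sorted(valid.items()):
        print(wohnid, name)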
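The patch fires all 2000 requests at once and relies on httpx's internal
connection pooling to queue them. A hedged sketch of one way to cap the
fan-out explicitly, reusing the request_async/FROM/TO names from
find_targets.py above; MAX_CONCURRENT is a hypothetical tuning knob, not
something the patch defines:

    import asyncio
    from httpx import AsyncClient
    from find_targets import FROM, TO, request_async

    MAX_CONCURRENT = 50  # hypothetical cap, not part of the patch

    async def bounded(index: int, client: AsyncClient,
                      sem: asyncio.Semaphore):
        # The semaphore lets at most MAX_CONCURRENT requests run at a time.
        async with sem:
            return await request_async(index, client)

    async def main():
        sem = asyncio.Semaphore(MAX_CONCURRENT)
        async with AsyncClient() as client:
            results = dict(
                await asyncio.gather(
                    *[bounded(i, client, sem) for i in range(FROM, TO)]
                )
            )
        print({i: name for i, name in results.items() if i != 0})

    asyncio.run(main())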