Initial scraping of valid IDs and names
Saves a pickled dictionary containing valid IDs and their names.
parent e8ddc823a7
commit 12f106602a

find_targets.py (new file, 47 lines)
@@ -0,0 +1,47 @@
import asyncio
import pickle
from typing import Tuple

from bs4 import BeautifulSoup
from httpx import AsyncClient

MAIN_URL = (
    "https://www.spiekeroog-vermieter.de/suche/monatskalenderSite.htm?wohnids="
)
DATA_URL = (
    "https://www.spiekeroog-vermieter.de/suche/monatskalender.htm?wohnids="
)
FROM = 0
TO = 2000


async def request_async(index: int, client: AsyncClient) -> Tuple[int, str]:
    response_data = await client.get(DATA_URL + str(index), timeout=20.0)
    # Invalid IDs are answered with an error page ("Fehler aufgetreten").
    if "Fehler aufgetreten" not in response_data.text:
        response_title = await client.get(MAIN_URL + str(index), timeout=20.0)
        soup = BeautifulSoup(response_title.text, "lxml")
        # h1 contains the house name, h2 the apartment
        house = soup.body.header.h1.get_text()
        apartment = soup.body.header.h2.get_text().replace("\xa0", " ")
        name = f"{house} - {apartment}"
        return index, name
    else:
        return 0, ""


async def get_valid_ids():
    client = AsyncClient()
    results = dict(
        await asyncio.gather(
            *[request_async(i, client) for i in range(FROM, TO)]
        )
    )
    # Drop the sentinel entry (key 0) produced by invalid IDs.
    valid = dict(filter(lambda item: item[0] != 0, results.items()))
    print(f"Valid ids: {valid}")
    await client.aclose()
    with open("valid_ids", "wb") as file:
        pickle.dump(valid, file)


if __name__ == "__main__":
    # print(pickle.load(open("valid_ids", "rb")))
    asyncio.run(get_valid_ids())
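
The script fires off all requests in the range at once via asyncio.gather and lets the client's connection pool throttle them. Once it has run, the pickled dictionary can be read back for inspection. A minimal sketch, assuming the "valid_ids" file sits in the working directory (the loop variable names here are illustrative):

import pickle

# Load the ID -> name mapping written by find_targets.py.
with open("valid_ids", "rb") as file:
    valid_ids = pickle.load(file)

for wohnid, name in valid_ids.items():
    print(f"{wohnid}: {name}")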