Scrape the days and availability

Johannes Rothe 2022-01-14 22:43:23 +01:00
parent 12f106602a
commit 31403c2e5d
2 changed files with 63 additions and 47 deletions


@@ -1,47 +0,0 @@
import asyncio
import pickle
from httpx import AsyncClient
from typing import Tuple
from bs4 import BeautifulSoup

MAIN_URL = (
    "https://www.spiekeroog-vermieter.de/suche/monatskalenderSite.htm?wohnids="
)
DATA_URL = (
    "https://www.spiekeroog-vermieter.de/suche/monatskalender.htm?wohnids="
)
# Range of apartment ids to probe
FROM = 0
TO = 2000


async def request_async(index: int, client: AsyncClient) -> Tuple:
    response_data = await client.get(DATA_URL + str(index), timeout=20.0)
    # "Fehler aufgetreten" ("an error occurred") marks an invalid id
    if "Fehler aufgetreten" not in response_data.text:
        response_title = await client.get(MAIN_URL + str(index), timeout=20.0)
        soup = BeautifulSoup(response_title.text, "lxml")
        # h1 contains the house name, h2 the apartment
        apartment = soup.body.header.h1.get_text()
        unit = soup.body.header.h2.get_text().replace("\xa0", " ")
        name = f"{apartment} - {unit}"
        return index, name
    else:
        return 0, ""


async def get_valid_ids():
    client = AsyncClient()
    results = dict(
        await asyncio.gather(
            *[request_async(i, client) for i in range(FROM, TO)]
        )
    )
    # Drop the (0, "") placeholder entries returned for invalid ids
    valid = dict(filter(lambda item: item[0] != 0, results.items()))
    print(f"Valid ids: {valid}")
    await client.aclose()
    with open("valid_ids", "wb") as file:
        pickle.dump(valid, file)


if __name__ == "__main__":
    # print(pickle.load(open("valid_ids", "rb")))
    asyncio.run(get_valid_ids())
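
A minimal sketch of reading the id map this (now removed) script produced; the dict shape {index: name} follows from get_valid_ids above:

import pickle

# valid_ids holds a dict mapping apartment id -> "house - unit" name
with open("valid_ids", "rb") as file:
    valid = pickle.load(file)
for index, name in sorted(valid.items()):
    print(index, name)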

scrape.py Normal file

@@ -0,0 +1,63 @@
import asyncio
import locale
import pickle
from datetime import datetime
from typing import Tuple

from bs4 import BeautifulSoup
from httpx import AsyncClient

MAIN_URL = "https://www.spiekeroog-vermieter.de/suche/monatskalenderSite.htm?wohnids="
DATA_URL = "https://www.spiekeroog-vermieter.de/suche/monatskalender.htm?wohnids="
# Range of apartment ids to scrape
FROM = 100
TO = 120
# CSS day classes: DayF = free, DayB = booked, DayFB/DayBF presumably change-over days
STATUS_MAPPING = {"DayF": 0, "DayB": 1, "DayFB": 2, "DayBF": 2}


def convert_to_datestring(day: str, month: str, year: str) -> str:
    # German locale so strptime understands month names like "Januar"
    locale.setlocale(locale.LC_TIME, "de_DE")
    date = datetime.strptime(f"{day.zfill(2)} {month} {year}", "%d %B %Y")
    return date.strftime("%Y-%m-%d")


async def request_data(index: int, client: AsyncClient) -> Tuple:
    response_data = await client.get(DATA_URL + str(index), timeout=20.0)
    # "Fehler aufgetreten" ("an error occurred") marks an invalid id
    if "Fehler aufgetreten" not in response_data.text:
        response_title = await client.get(MAIN_URL + str(index), timeout=20.0)
        title_soup = BeautifulSoup(response_title.text, "lxml")
        # h1 contains the house name, h2 the apartment
        apartment = title_soup.body.header.h1.get_text()
        unit = (
            title_soup.body.header.h2.get_text()
            .replace("\xa0", " ")
            .replace("Wohneinheit: ", "")
        )
        data_soup = BeautifulSoup(response_data.text, "lxml")
        valid_elements = data_soup.find_all("td", attrs={"data-daynum": True})
        days = []
        for elm in valid_elements:
            day = elm.get_text(strip=True)
            # The enclosing month table carries a header such as "Januar 2022"
            date_raw = elm.parent.parent.parent.thead.tr.td.get_text(strip=True)
            month = date_raw.split(" ")[0]
            year = date_raw.split(" ")[1]
            status = STATUS_MAPPING[elm["class"][0]]
            date = convert_to_datestring(day, month, year)
            days.append({date: status})
        return index, apartment, unit, days
    else:
        # Keep the tuple shape consistent so callers can unpack either branch
        return 0, "", "", []


async def extract_results() -> None:
    client = AsyncClient()
    results = await asyncio.gather(*[request_data(i, client) for i in range(FROM, TO)])
    # Filter the invalid units
    valid = list(filter(lambda item: item[0] != 0, results))
    # print(f"results: {valid}")
    await client.aclose()
    with open("valid_ids", "wb") as file:
        pickle.dump(valid, file)


if __name__ == "__main__":
    # print(pickle.load(open("valid_ids", "rb")))
    asyncio.run(extract_results())
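
For reference, a minimal sketch of reading the pickled results back in; the (index, apartment, unit, days) tuple layout follows from request_data above, while the booked-day count is illustrative only:

import pickle

# Load the scraped list of (index, apartment, unit, days) tuples
with open("valid_ids", "rb") as file:
    units = pickle.load(file)

for index, apartment, unit, days in units:
    # days is a list of single-entry {date: status} dicts; status 1 = booked
    booked = sum(1 for day in days for status in day.values() if status == 1)
    print(f"{index}: {apartment} - {unit} ({booked} booked days)")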