"""Scrape apartment occupancy calendars from spiekeroog-vermieter.de.

For every unit id in ``range(FROM, TO)`` the month-calendar page is
downloaded, each day cell is turned into an ISO date with a numeric status,
and the collected results are pickled to the file ``valid_ids`` in the
current working directory.

The result file can be inspected with ``pickle.load(open("valid_ids", "rb"))``;
only unpickle files you produced yourself.
"""

import asyncio
import locale  # no longer needed for date parsing; kept so the module namespace is unchanged
import pickle
from datetime import datetime
from typing import Tuple

from bs4 import BeautifulSoup
from httpx import AsyncClient

MAIN_URL = "https://www.spiekeroog-vermieter.de/suche/monatskalenderSite.htm?wohnids="
DATA_URL = "https://www.spiekeroog-vermieter.de/suche/monatskalender.htm?wohnids="

# Unit ids to scan: range(FROM, TO), i.e. TO itself is excluded.
FROM = 100
TO = 120

# Maps the first CSS class of a day cell to a numeric status.
# NOTE(review): presumably F = free, B = booked and FB/BF = changeover day;
# confirm against the site's markup.
STATUS_MAPPING = {"DayF": 0, "DayB": 1, "DayFB": 2, "DayBF": 2}

# German month names as printed in the calendar headings, lower-cased.
# A fixed table avoids switching the process-wide locale (and failing on
# systems where the "de_DE" locale is not installed).
_GERMAN_MONTHS = {
    "januar": 1,
    "februar": 2,
    "märz": 3,
    "april": 4,
    "mai": 5,
    "juni": 6,
    "juli": 7,
    "august": 8,
    "september": 9,
    "oktober": 10,
    "november": 11,
    "dezember": 12,
}


def convert_to_datestring(day: str, month: str, year: str) -> str:
    """Return ``YYYY-MM-DD`` for a day number, German month name and year.

    Args:
        day: Day of the month as a string, with or without a leading zero.
        month: Full German month name (case-insensitive), e.g. ``"Januar"``.
        year: Four-digit year as a string.

    Raises:
        ValueError: If the month name is unknown or the parts do not form a
            valid calendar date.
    """
    try:
        month_number = _GERMAN_MONTHS[month.strip().lower()]
    except KeyError:
        raise ValueError(f"unknown German month name: {month!r}") from None
    # %d, %m and %Y are locale-independent, so strptime still validates the
    # date (day range, four-digit year) without touching the locale.
    date = datetime.strptime(
        f"{day.zfill(2)} {month_number:02d} {year}", "%d %m %Y"
    )
    return date.strftime("%Y-%m-%d")


async def request_data(index: int, client: AsyncClient) -> Tuple:
    """Fetch and parse the calendar of the unit with id ``index``.

    Args:
        index: Unit id appended to ``DATA_URL`` / ``MAIN_URL``.
        client: Shared HTTP client used for both requests.

    Returns:
        ``(index, apartment, unit, days)`` where ``days`` is a list of
        single-entry dicts ``{"YYYY-MM-DD": status}``; or the marker tuple
        ``(0, "")`` when the calendar page contains "Fehler aufgetreten"
        ("an error occurred"). Callers detect the marker via a first
        element of ``0``.
    """
    response_data = await client.get(DATA_URL + str(index), timeout=20.0)
    if "Fehler aufgetreten" in response_data.text:
        return 0, ""

    # The title page is only requested once the calendar page looks valid.
    response_title = await client.get(MAIN_URL + str(index), timeout=20.0)
    header = BeautifulSoup(response_title.text, "lxml").body.header
    apartment = header.h1.get_text()
    unit = (
        header.h2.get_text()
        .replace("\xa0", " ")
        .replace("Wohneinheit: ", "")
    )

    data_soup = BeautifulSoup(response_data.text, "lxml")
    days = []
    for cell in data_soup.find_all("td", attrs={"data-daynum": True}):
        day = cell.get_text(strip=True)
        # The first <thead> cell of the enclosing table holds "<month> <year>".
        heading = cell.parent.parent.parent.thead.tr.td.get_text(strip=True)
        heading_parts = heading.split(" ")
        month, year = heading_parts[0], heading_parts[1]
        status = STATUS_MAPPING[cell["class"][0]]
        days.append({convert_to_datestring(day, month, year): status})
    return index, apartment, unit, days


async def extract_results() -> None:
    """Scan all unit ids in ``range(FROM, TO)`` and pickle the valid results.

    All units are requested concurrently over one shared client. Entries
    whose first element is ``0`` (the invalid marker of ``request_data``)
    are dropped; the rest is written to the file ``valid_ids``.
    """
    # ``async with`` closes the client even when one of the requests raises.
    async with AsyncClient() as client:
        results = await asyncio.gather(
            *(request_data(i, client) for i in range(FROM, TO))
        )

    # Filter the invalid units.
    valid = [item for item in results if item[0] != 0]

    with open("valid_ids", "wb") as file:
        pickle.dump(valid, file)


if __name__ == "__main__":
    asyncio.run(extract_results())