"""Scrape apartment availability calendars and export them as CSV.

For every numeric id in ``range(FROM, TO)`` the month-calendar page of
spiekeroog-vermieter.de is fetched and parsed.  ``extract_results`` stores
the parsed entries in ``results.json``; ``generate_csv`` turns such a result
into ``result.csv`` with one row per apartment and one column per day.
"""

import asyncio
import csv
import json
import locale
import pickle
from datetime import datetime
from typing import List, Optional, Tuple

from bs4 import BeautifulSoup
from httpx import AsyncClient
from pydantic import BaseModel

# Page carrying the house / apartment title for a given id.
MAIN_URL = (
    "https://www.spiekeroog-vermieter.de/suche/monatskalenderSite.htm?wohnids="
)
# Page carrying the calendar tables for a given id.
DATA_URL = (
    "https://www.spiekeroog-vermieter.de/suche/monatskalender.htm?wohnids="
)
# Half-open range of ids that extract_results() queries.
FROM = 0
TO = 2000
# CSS class of a calendar cell -> numeric status stored in the output.
# NOTE(review): presumably F = free, B = booked, FB/BF = change-over day;
# confirm against the website's legend.
STATUS_MAPPING = {"DayF": 0, "DayB": 1, "DayFB": 2, "DayBF": 2}

# Timeout in seconds applied to every HTTP request.
_REQUEST_TIMEOUT = 20.0
# Upper bound for simultaneously running id lookups.
_MAX_CONCURRENT_REQUESTS = 50
# Format of the per-day column headers in the CSV file.
_CSV_DATE_FORMAT = "%Y-%m-%d"


class Availability(BaseModel):
    """Status of one apartment on one calendar day."""

    # Day the status applies to.
    date: datetime
    # One of the values of STATUS_MAPPING.
    status: int


class Entry(BaseModel):
    """Scraped data of one id: titles plus the per-day availabilities."""

    # Id that was appended to MAIN_URL / DATA_URL.
    index: int
    # Explicit ``None`` defaults keep the optional fields optional on
    # pydantic releases that no longer imply a default for ``Optional``.
    # Text of the page header's <h1>.
    haus: Optional[str] = None
    # Text of the page header's <h2> without the "Wohneinheit: " prefix.
    wohneinheit: Optional[str] = None
    # One item per calendar cell found on the data page.
    availabilities: Optional[List[Availability]] = None


class Result(BaseModel):
    """Container that is serialised to / loaded from ``results.json``."""

    entries: List[Entry]


def _collect_date_columns(entries: List[Entry]) -> List[str]:
    """Return the per-day CSV column names needed for *entries*.

    The columns of the first entry are kept exactly as they occur; dates
    that only show up in later entries are appended once each, so every
    row can be written without ``DictWriter`` rejecting an unknown key.
    """
    if not entries:
        return []

    columns = [
        avail.date.strftime(_CSV_DATE_FORMAT)
        for avail in entries[0].availabilities or []
    ]
    known = set(columns)
    for entry in entries[1:]:
        for avail in entry.availabilities or []:
            column = avail.date.strftime(_CSV_DATE_FORMAT)
            if column not in known:
                known.add(column)
                columns.append(column)
    return columns


def generate_csv(result: Result) -> None:
    """Write *result* to ``result.csv`` in the current working directory.

    The header consists of the ``Entry`` fields except ``availabilities``
    followed by one ``YYYY-MM-DD`` column per day; each row holds the status
    of one entry for those days (blank where an entry has no value).

    Args:
        result: Parsed entries to export. An empty result produces a file
            that only contains the header row.
    """
    fieldnames = list(Entry.schema()["properties"].keys())
    fieldnames.remove("availabilities")
    fieldnames.extend(_collect_date_columns(result.entries))

    # newline="" lets the csv module control line endings itself; without
    # it every row ends in "\r\r\n" on platforms that translate "\n".
    with open("result.csv", "w", newline="") as csvfile:
        csvwriter = csv.DictWriter(csvfile, fieldnames=fieldnames)
        csvwriter.writeheader()
        for entry in result.entries:
            row_content = {
                "index": entry.index,
                "haus": entry.haus,
                "wohneinheit": entry.wohneinheit,
            }
            # availabilities may be None for entries without calendar data.
            for avail in entry.availabilities or []:
                column = avail.date.strftime(_CSV_DATE_FORMAT)
                row_content[column] = avail.status
            csvwriter.writerow(row_content)


def convert_to_datestring(day: str, month: str, year: str) -> datetime:
    """Parse a day number, German month name and year into a ``datetime``.

    Despite its name the function returns a ``datetime``, not a string.

    Side effect: switches the process-wide ``LC_TIME`` locale to
    ``de_DE.utf8`` so that ``%B`` matches German month names.

    Args:
        day: Day of the month, with or without a leading zero.
        month: Full month name as understood by the German locale.
        year: Four-digit year.

    Raises:
        locale.Error: If the ``de_DE.utf8`` locale is not available.
        ValueError: If the parts do not form a valid date.
    """
    locale.setlocale(locale.LC_TIME, "de_DE.utf8")
    return datetime.strptime(f"{day.zfill(2)} {month} {year}", "%d %B %Y")


async def request_data(index: int, client: AsyncClient) -> Optional[Entry]:
    """Fetch and parse the calendar of the id *index*.

    Args:
        index: Id appended to ``DATA_URL`` and ``MAIN_URL``.
        client: HTTP client used for both requests.

    Returns:
        The parsed entry, or ``None`` when the data page contains the
        site's error text. ``None`` is used instead of a dummy entry with
        ``index=0`` because 0 itself lies inside ``range(FROM, TO)`` and
        would otherwise be indistinguishable from "nothing found".

    Raises:
        KeyError: If a calendar cell carries a class that is missing from
            ``STATUS_MAPPING``.
    """
    response_data = await client.get(
        DATA_URL + str(index), timeout=_REQUEST_TIMEOUT
    )
    if "Fehler aufgetreten" in response_data.text:
        return None

    # The title page is only requested once the id is known to have data.
    response_title = await client.get(
        MAIN_URL + str(index), timeout=_REQUEST_TIMEOUT
    )
    title_soup = BeautifulSoup(response_title.text, "lxml")
    apartment = (
        title_soup.body.header.h2.get_text()
        .replace("\xa0", " ")
        .replace("Wohneinheit: ", "")
    )

    data_soup = BeautifulSoup(response_data.text, "lxml")
    availabilities = []
    for cell in data_soup.find_all("td", attrs={"data-daynum": True}):
        # Three levels above the cell sits the element whose <thead> holds
        # the heading text, which is split into month name and year.
        date_raw = cell.parent.parent.parent.thead.tr.td.get_text(strip=True)
        heading_parts = date_raw.split(" ")
        status = STATUS_MAPPING[cell["class"][0]]
        date = convert_to_datestring(
            cell.get_text(strip=True),
            heading_parts[0],
            heading_parts[1],
        )
        availabilities.append(Availability(date=date, status=status))

    return Entry(
        index=index,
        haus=title_soup.body.header.h1.get_text(),
        wohneinheit=apartment,
        availabilities=availabilities,
    )


async def extract_results() -> None:
    """Scrape every id in ``range(FROM, TO)`` and write ``results.json``.

    Ids for which ``request_data`` returns ``None`` are left out of the
    stored result.
    """
    # httpx bounds its connection pool and the request timeout also covers
    # the wait for a free connection, so starting all lookups at once can
    # fail with pool timeouts; the semaphore caps the fan-out instead.
    semaphore = asyncio.Semaphore(_MAX_CONCURRENT_REQUESTS)

    # The context manager closes the client even when a lookup raises.
    async with AsyncClient() as client:

        async def fetch(index: int) -> Optional[Entry]:
            async with semaphore:
                return await request_data(index, client)

        fetched = await asyncio.gather(
            *(fetch(i) for i in range(FROM, TO))
        )

    result = Result(
        entries=[entry for entry in fetched if entry is not None]
    )
    with open("results.json", "w") as file:
        file.write(result.json())


if __name__ == "__main__":
    # Converts a previously scraped results.json into result.csv.
    # To refresh results.json from the website beforehand, run:
    #     asyncio.run(extract_results())
    with open("results.json", "r") as file:
        result = Result(**json.load(file))
    generate_csv(result)