# scraperoog/scrape.py
#
# 64 lines
# 2.3 KiB
# Python

import asyncio
import locale
import pickle
from datetime import datetime
from typing import Tuple
from bs4 import BeautifulSoup
from httpx import AsyncClient
# Page from which the apartment name (h1) and unit label (h2) are read; the
# numeric unit id is appended to the URL.
MAIN_URL = "https://www.spiekeroog-vermieter.de/suche/monatskalenderSite.htm?wohnids="
# Page from which the per-day calendar cells are read; the numeric unit id is
# appended to the URL.
DATA_URL = "https://www.spiekeroog-vermieter.de/suche/monatskalender.htm?wohnids="

# Unit ids to probe, used as range(FROM, TO): FROM is included, TO is excluded.
FROM = 100
TO = 120

# Maps the first CSS class of a calendar day cell to a numeric status code.
# NOTE(review): presumably F = free and B = booked, with FB/BF marking
# half-day changeovers -- confirm against the site's markup.
STATUS_MAPPING = {
    "DayF": 0,
    "DayB": 1,
    "DayFB": 2,
    "DayBF": 2,
}
# German month names (as "%B" yields them under a de_DE locale) mapped to
# their month number. Keys are case-folded so lookups are case-insensitive.
_GERMAN_MONTHS = {
    "januar": 1,
    "februar": 2,
    "märz": 3,
    "april": 4,
    "mai": 5,
    "juni": 6,
    "juli": 7,
    "august": 8,
    "september": 9,
    "oktober": 10,
    "november": 11,
    "dezember": 12,
}


def convert_to_datestring(day: str, month: str, year: str) -> str:
    """Convert a German calendar date into an ISO ``YYYY-MM-DD`` string.

    The month is resolved through an explicit lookup table rather than
    ``locale.setlocale(LC_TIME, "de_DE")``. Switching the locale changes
    process-wide state on every call and raises ``locale.Error`` on systems
    where the German locale is not installed.

    Args:
        day: Day of the month as a decimal string, e.g. ``"1"`` or ``"01"``.
        month: Full German month name, e.g. ``"März"`` (case-insensitive).
        year: Year as a decimal string, e.g. ``"2022"``.

    Returns:
        The date formatted as ``YYYY-MM-DD``.

    Raises:
        ValueError: If the month name is unknown, ``day`` or ``year`` is not
            numeric, or the combination is not a valid calendar date.
    """
    try:
        month_number = _GERMAN_MONTHS[month.strip().casefold()]
    except KeyError:
        raise ValueError(f"unknown German month name: {month!r}") from None
    # datetime() validates the day/month/year combination (e.g. 30 February).
    return datetime(int(year), month_number, int(day)).date().isoformat()
async def request_data(index: int, client: AsyncClient) -> Tuple:
    """Fetch the calendar and title information for one unit id.

    The calendar page for ``index`` is requested first. If its text contains
    "Fehler aufgetreten" the id is treated as invalid and no further request
    is made. Otherwise the title page is requested and both documents are
    parsed.

    Args:
        index: Numeric unit id appended to ``DATA_URL`` and ``MAIN_URL``.
        client: Shared HTTP client used for both requests.

    Returns:
        ``(index, apartment, unit, days)`` for a valid id, where ``days`` is a
        list of single-entry dicts mapping an ISO date string to a status
        code from ``STATUS_MAPPING``. For an invalid id the two-tuple
        ``(0, "")`` is returned.
    """
    calendar_response = await client.get(DATA_URL + str(index), timeout=20.0)
    if "Fehler aufgetreten" in calendar_response.text:
        # Sentinel for "no such unit"; callers filter on the leading 0.
        return 0, ""

    title_response = await client.get(MAIN_URL + str(index), timeout=20.0)
    header = BeautifulSoup(title_response.text, "lxml").body.header
    apartment = header.h1.get_text()
    unit_text = header.h2.get_text()
    # Normalise non-breaking spaces, then drop the "Wohneinheit: " label.
    unit = unit_text.replace("\xa0", " ").replace("Wohneinheit: ", "")

    calendar_soup = BeautifulSoup(calendar_response.text, "lxml")
    days = []
    # Only cells carrying a data-daynum attribute are processed.
    for cell in calendar_soup.find_all("td", attrs={"data-daynum": True}):
        day = cell.get_text(strip=True)
        # The heading is read from the first thead cell three levels up.
        # NOTE(review): presumably "<month name> <year>" -- confirm.
        heading = cell.parent.parent.parent.thead.tr.td.get_text(strip=True)
        heading_parts = heading.split(" ")
        month = heading_parts[0]
        year = heading_parts[1]
        # The first CSS class of the cell selects the status code.
        status = STATUS_MAPPING[cell["class"][0]]
        days.append({convert_to_datestring(day, month, year): status})
    return index, apartment, unit, days
async def extract_results() -> None:
    """Scrape every unit id in ``range(FROM, TO)`` and pickle the valid ones.

    All ids are requested concurrently through one shared ``AsyncClient``.
    Results whose first element is ``0`` (the invalid-id sentinel returned by
    ``request_data``) are dropped. The rest are written as a pickled list to
    the file ``valid_ids`` in the current working directory.
    """
    # "async with" closes the client even when one of the requests raises;
    # a trailing aclose() call would be skipped in that case.
    async with AsyncClient() as client:
        results = await asyncio.gather(
            *(request_data(i, client) for i in range(FROM, TO))
        )

    # Filter out the invalid units.
    valid = [item for item in results if item[0] != 0]

    with open("valid_ids", "wb") as file:
        pickle.dump(valid, file)
if __name__ == "__main__":
    # Script entry point: run the scrape once and write the "valid_ids" file.
    # To inspect a previous run: pickle.load(open("valid_ids", "rb"))
    asyncio.run(extract_results())