scraperoog/scrape.py

120 lines
3.6 KiB
Python

import asyncio
import csv
import json
import locale
import pickle
from datetime import datetime
from typing import List, Optional, Tuple
from bs4 import BeautifulSoup
from httpx import AsyncClient
from pydantic import BaseModel
# Both URLs are completed by appending a numeric ``wohnids`` value.
# MAIN_URL is the page whose header holds the house / apartment headings,
# DATA_URL is the page that contains the month calendar cells.
MAIN_URL = "https://www.spiekeroog-vermieter.de/suche/monatskalenderSite.htm?wohnids="
DATA_URL = "https://www.spiekeroog-vermieter.de/suche/monatskalender.htm?wohnids="

# Half-open range of ``wohnids`` values probed by extract_results():
# FROM is included, TO is excluded.
FROM = 0
TO = 2000

# Maps the first CSS class of a calendar day cell to the numeric status that
# is stored in Availability.status.
# NOTE(review): presumably F = "frei" (free), B = "belegt" (booked) and the
# FB / BF variants mark change-over days -- confirm against the site's markup.
STATUS_MAPPING = {
    "DayF": 0,
    "DayB": 1,
    "DayFB": 2,
    "DayBF": 2,
}
class Availability(BaseModel):
    """Status of a single calendar day for one rental unit."""

    # Calendar day this record refers to.
    date: datetime
    # Numeric status code; request_data() fills it from STATUS_MAPPING.
    status: int
class Entry(BaseModel):
    """One scraped rental unit together with its availability calendar.

    Only ``index`` is required. The remaining fields default to ``None`` so
    that a bare ``Entry(index=0)`` placeholder can be constructed; the
    defaults are spelled out explicitly because newer pydantic versions no
    longer treat a bare ``Optional[...]`` annotation as "defaults to None".
    """

    # The ``wohnids`` value the entry was requested with. request_data()
    # returns index 0 as its "nothing found" placeholder.
    index: int
    # Text of the <h1> heading in the page header.
    haus: Optional[str] = None
    # Text of the <h2> heading with the "Wohneinheit: " prefix removed.
    wohneinheit: Optional[str] = None
    # One record per calendar day cell found on the calendar page.
    availabilities: Optional[List[Availability]] = None
class Result(BaseModel):
    """Container for a complete scrape run; serialised to ``results.json``."""

    # Every Entry that survived the filter in extract_results().
    entries: List[Entry]
def generate_csv(result: Result) -> None:
    """Write ``result`` to ``result.csv`` with one row per entry.

    The header consists of the ``Entry`` fields (without ``availabilities``)
    followed by one ``YYYY-MM-DD`` column per calendar day. Day columns are
    gathered from *all* entries in first-seen order, so an entry that covers
    a day the first entry lacks no longer makes ``DictWriter`` raise; cells
    for days an entry does not cover are left empty.

    An empty ``result.entries`` list produces a file that contains only the
    base header, and an entry whose ``availabilities`` is ``None`` is written
    without any day values.

    Args:
        result: The scrape result to export.
    """
    base_fields = list(Entry.schema()["properties"].keys())
    base_fields.remove("availabilities")

    # A dict is used as an insertion-ordered set of day column names.
    day_columns = {}
    rows = []
    for entry in result.entries:
        row = {
            "index": entry.index,
            "haus": entry.haus,
            "wohneinheit": entry.wohneinheit,
        }
        for avail in entry.availabilities or []:
            column = avail.date.strftime("%Y-%m-%d")
            day_columns.setdefault(column, None)
            row[column] = avail.status
        rows.append(row)

    # newline="" is what the csv module expects; without it every row is
    # followed by a blank line on platforms that translate "\n" to "\r\n".
    with open("result.csv", "w", newline="") as csvfile:
        csvwriter = csv.DictWriter(
            csvfile, fieldnames=base_fields + list(day_columns)
        )
        csvwriter.writeheader()
        csvwriter.writerows(rows)
# German full month names (the spellings ``%B`` matches under a German
# locale), stored casefolded so the lookup ignores capitalisation.
_GERMAN_MONTHS = {
    "januar": 1,
    "februar": 2,
    "märz": 3,
    "april": 4,
    "mai": 5,
    "juni": 6,
    "juli": 7,
    "august": 8,
    "september": 9,
    "oktober": 10,
    "november": 11,
    "dezember": 12,
}


def convert_to_datestring(day: str, month: str, year: str) -> datetime:
    """Build a ``datetime`` from a day number, German month name and year.

    Despite its name the function returns a naive ``datetime`` at midnight,
    not a string. The month is resolved through a fixed lookup table instead
    of ``locale.setlocale``: that avoids changing process-wide locale state
    on every call and works on systems where ``de_DE.utf8`` is not installed.

    Args:
        day: Day of the month as text, with or without a leading zero.
        month: Full German month name, e.g. ``"Januar"`` or ``"März"``.
        year: Year as text, e.g. ``"2024"``.

    Returns:
        The corresponding ``datetime``.

    Raises:
        ValueError: If the month name is unknown, ``day`` or ``year`` is not
            numeric, or the combination is not a valid calendar date.
    """
    try:
        month_number = _GERMAN_MONTHS[month.strip().casefold()]
    except KeyError:
        raise ValueError(f"unknown German month name: {month!r}") from None
    return datetime(int(year), month_number, int(day))
async def request_data(index: int, client: AsyncClient) -> Optional[Entry]:
    """Fetch and parse the calendar of the unit with the given ``index``.

    The calendar page (``DATA_URL``) is requested first. If its text contains
    ``"Fehler aufgetreten"`` the placeholder ``Entry(index=0)`` is returned
    and no further request is made. Otherwise the overview page
    (``MAIN_URL``) is fetched for the house and apartment headings, and every
    ``<td>`` carrying a ``data-daynum`` attribute becomes one
    ``Availability``.

    Args:
        index: The ``wohnids`` value appended to both URLs.
        client: Shared HTTP client used for both requests.

    Returns:
        The populated ``Entry``, or ``Entry(index=0)`` for an error page.
    """
    calendar_response = await client.get(DATA_URL + str(index), timeout=20.0)
    if "Fehler aufgetreten" in calendar_response.text:
        return Entry(index=0)

    overview_response = await client.get(MAIN_URL + str(index), timeout=20.0)
    overview = BeautifulSoup(overview_response.text, "lxml")
    unit_heading = overview.body.header.h2.get_text()
    # Normalise non-breaking spaces before stripping the label prefix.
    unit_name = unit_heading.replace("\xa0", " ").replace("Wohneinheit: ", "")

    calendar = BeautifulSoup(calendar_response.text, "lxml")
    day_records = []
    for cell in calendar.find_all("td", attrs={"data-daynum": True}):
        # The first header cell of the table three levels up holds the
        # "<month> <year>" caption for this day cell.
        caption = cell.parent.parent.parent.thead.tr.td.get_text(strip=True)
        status_code = STATUS_MAPPING[cell["class"][0]]
        day_text = cell.get_text(strip=True)
        caption_parts = caption.split(" ")
        day = convert_to_datestring(
            day_text, caption_parts[0], caption_parts[1]
        )
        day_records.append(Availability(date=day, status=status_code))

    return Entry(
        index=index,
        haus=overview.body.header.h1.get_text(),
        wohneinheit=unit_name,
        availabilities=day_records,
    )
async def extract_results() -> None:
    """Scrape every index in ``range(FROM, TO)`` and save ``results.json``.

    All requests share one ``AsyncClient``. The client is used as an async
    context manager so that it is closed even when one of the requests
    raises and ``asyncio.gather`` propagates the exception.

    Placeholder results are discarded before saving: entries with
    ``index == 0`` (what ``request_data`` returns for an error page) and
    ``None`` (allowed by its ``Optional[Entry]`` return annotation).

    NOTE(review): every request is started at once; if the site or the
    connection pool cannot keep up, consider bounding the concurrency.
    """
    async with AsyncClient() as client:
        scraped = await asyncio.gather(
            *(request_data(i, client) for i in range(FROM, TO))
        )

    entries = [
        entry for entry in scraped if entry is not None and entry.index != 0
    ]
    result = Result(entries=entries)
    with open("results.json", "w") as file:
        file.write(result.json())
if __name__ == "__main__":
    # Default mode: convert a previously saved scrape into result.csv.
    with open("results.json", "r") as json_file:
        stored = json.load(json_file)
    generate_csv(Result(**stored))
    # Scraping is currently disabled; extract_results() is what writes
    # results.json in the first place.
    # asyncio.run(extract_results())