From 8f4fc97da088998c44fd38027fe8b9bf6a31da9a Mon Sep 17 00:00:00 2001 From: Johannes Rothe Date: Sat, 15 Jan 2022 13:43:58 +0100 Subject: [PATCH] Filter out none results and start cleaning up --- README.md | 2 ++ scrape.py | 56 ++++++++++++++++++++++++++++++++----------------------- 2 files changed, 35 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 8417393..f237898 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,4 @@ # scraperoog +`apt install python3-pydantic python3-httpx python3-bs4 locales-all` +`sudo locale-gen` diff --git a/scrape.py b/scrape.py index 0b8d996..a138606 100644 --- a/scrape.py +++ b/scrape.py @@ -10,10 +10,14 @@ from bs4 import BeautifulSoup from httpx import AsyncClient from pydantic import BaseModel -MAIN_URL = "https://www.spiekeroog-vermieter.de/suche/monatskalenderSite.htm?wohnids=" -DATA_URL = "https://www.spiekeroog-vermieter.de/suche/monatskalender.htm?wohnids=" -FROM = 110 -TO = 120 +MAIN_URL = ( + "https://www.spiekeroog-vermieter.de/suche/monatskalenderSite.htm?wohnids=" +) +DATA_URL = ( + "https://www.spiekeroog-vermieter.de/suche/monatskalender.htm?wohnids=" +) +FROM = 0 +TO = 2000 STATUS_MAPPING = {"DayF": 0, "DayB": 1, "DayFB": 2, "DayBF": 2} @@ -24,9 +28,9 @@ class Availability(BaseModel): class Entry(BaseModel): index: int - haus: str - wohneinheit: str - availabilities: List[Availability] + haus: Optional[str] + wohneinheit: Optional[str] + availabilities: Optional[List[Availability]] class Result(BaseModel): @@ -37,18 +41,27 @@ def generate_csv(result: Result) -> None: with open("result.csv", "w") as csvfile: fieldnames = list(Entry.schema()["properties"].keys()) fieldnames.remove("availabilities") - fieldnames.extend([a.date.strftime("%Y-%m-%d") for a in result.entries[0].availabilities]) + fieldnames.extend( + [ + a.date.strftime("%Y-%m-%d") + for a in result.entries[0].availabilities + ] + ) csvwriter = csv.DictWriter(csvfile, fieldnames=fieldnames) csvwriter.writeheader() for entry in result.entries: - row_content = {"index": entry.index, "haus": entry.haus, "wohneinheit": entry.wohneinheit} + row_content = { + "index": entry.index, + "haus": entry.haus, + "wohneinheit": entry.wohneinheit, + } for avail in entry.availabilities: row_content[avail.date.strftime("%Y-%m-%d")] = avail.status csvwriter.writerow(row_content) def convert_to_datestring(day: str, month: str, year: str) -> datetime: - locale.setlocale(locale.LC_TIME, "de_DE") + locale.setlocale(locale.LC_TIME, "de_DE.utf8") date = datetime.strptime(f"{day.zfill(2)} {month} {year}", "%d %B %Y") return date @@ -71,7 +84,9 @@ async def request_data(index: int, client: AsyncClient) -> Optional[Entry]: date_raw = elm.parent.parent.parent.thead.tr.td.get_text(strip=True) status = STATUS_MAPPING[elm["class"][0]] date = convert_to_datestring( - elm.get_text(strip=True), date_raw.split(" ")[0], date_raw.split(" ")[1] + elm.get_text(strip=True), + date_raw.split(" ")[0], + date_raw.split(" ")[1], ) availabilities.append(Availability(date=date, status=status)) return Entry( @@ -81,29 +96,24 @@ async def request_data(index: int, client: AsyncClient) -> Optional[Entry]: availabilities=availabilities, ) else: - return None + return Entry(index=0) async def extract_results() -> None: client = AsyncClient() - result = Result( - entries=await asyncio.gather( - *[request_data(i, client) for i in range(FROM, TO)] - ) + entries = await asyncio.gather( + *[request_data(i, client) for i in range(FROM, TO)] ) - # Filter the invalid units - # valid = list(filter(lambda item: item[0] != 0, results)) - # print(f"results: {valid}") + entries = list(filter(lambda entry: entry.index != 0, entries)) + result = Result(entries=entries) await client.aclose() - print(result.json()) with open("results.json", "w") as file: file.write(result.json()) if __name__ == "__main__": with open("results.json", "r") as file: - result = Result(**json.load(file)) - print(result.json()) + result = Result(**json.load(file)) generate_csv(result) - # asyncio.run(extract_results()) + #asyncio.run(extract_results())