From 50c344c5837e9068d98e2e393ce64c7e2d388995 Mon Sep 17 00:00:00 2001 From: Johannes Rothe Date: Sun, 16 Jan 2022 22:15:19 +0100 Subject: [PATCH] Add config, make windows compatible --- config.ini | 5 ++++ scrape.py | 69 ++++++++++++++++++++++++++++++++++-------------------- 2 files changed, 48 insertions(+), 26 deletions(-) create mode 100644 config.ini diff --git a/config.ini b/config.ini new file mode 100644 index 0000000..e2a9363 --- /dev/null +++ b/config.ini @@ -0,0 +1,5 @@ +[Allgemein] +haupt_url = https://www.spiekeroog-vermieter.de/suche/monatskalenderSite.htm?wohnids= +data_url = https://www.spiekeroog-vermieter.de/suche/monatskalender.htm?wohnids= +von_id = 200 +bis_id = 300 diff --git a/scrape.py b/scrape.py index 841dd69..e1f47b5 100644 --- a/scrape.py +++ b/scrape.py @@ -3,21 +3,26 @@ import csv import json import locale import pickle +import platform +from configparser import ConfigParser from datetime import datetime from typing import List, Optional, Tuple +if platform.system() == "Windows": + import encodings.idna +import tqdm from bs4 import BeautifulSoup from httpx import AsyncClient from pydantic import BaseModel -MAIN_URL = ( - "https://www.spiekeroog-vermieter.de/suche/monatskalenderSite.htm?wohnids=" -) -DATA_URL = ( - "https://www.spiekeroog-vermieter.de/suche/monatskalender.htm?wohnids=" -) -FROM = 0 -TO = 2000 +config = ConfigParser() +config.read("config.ini") + +MAIN_URL = config['Allgemein'].get('haupt_url') +DATA_URL = config['Allgemein'].get('data_url') +FROM = config['Allgemein'].getint('von_id') +TO = config['Allgemein'].getint('bis_id') +DATEFORMAT = "%Y-%m-%d" STATUS_MAPPING = {"DayF": 0, "DayB": 1, "DayFB": 0.5, "DayBF": 0.5} @@ -31,6 +36,17 @@ class Entry(BaseModel): haus: Optional[str] wohneinheit: Optional[str] availabilities: Optional[List[Availability]] + def __lt__(self, other): + return self.index < other.index + + def __le__(self, other): + return self.index <= other.index + + def __gt__(self, other): + return self.index > other.index + + def __ge__(self, other): + return self.index >= other.index class Result(BaseModel): @@ -38,14 +54,11 @@ class Result(BaseModel): def generate_csv(result: Result) -> None: - with open("result.csv", "w") as csvfile: + with open("result.csv", "w", newline='') as csvfile: fieldnames = list(Entry.schema()["properties"].keys()) fieldnames.remove("availabilities") fieldnames.extend( - [ - a.date.strftime("%Y-%m-%d") - for a in result.entries[0].availabilities - ] + [a.date.strftime(DATEFORMAT) for a in result.entries[0].availabilities] ) csvwriter = csv.DictWriter(csvfile, fieldnames=fieldnames) csvwriter.writeheader() @@ -56,12 +69,12 @@ def generate_csv(result: Result) -> None: "wohneinheit": entry.wohneinheit, } for avail in entry.availabilities: - row_content[avail.date.strftime("%Y-%m-%d")] = avail.status + row_content[avail.date.strftime(DATEFORMAT)] = avail.status csvwriter.writerow(row_content) def convert_to_datestring(day: str, month: str, year: str) -> datetime: - locale.setlocale(locale.LC_TIME, "de_DE.utf8") + locale.setlocale(locale.LC_TIME, "") date = datetime.strptime(f"{day.zfill(2)} {month} {year}", "%d %B %Y") return date @@ -91,8 +104,8 @@ async def request_data(index: int, client: AsyncClient) -> Optional[Entry]: availabilities.append(Availability(date=date, status=status)) return Entry( index=index, - haus=title_soup.body.header.h1.get_text(), - wohneinheit=apartment, + haus=title_soup.body.header.h1.get_text().encode("utf-8"), + wohneinheit=apartment.encode("utf-8"), availabilities=availabilities, ) else: @@ -101,19 +114,23 @@ async def request_data(index: int, client: AsyncClient) -> Optional[Entry]: async def extract_results() -> None: client = AsyncClient() - entries = await asyncio.gather( - *[request_data(i, client) for i in range(FROM, TO)] - ) - entries = list(filter(lambda entry: entry.index != 0, entries)) - result = Result(entries=entries) + tasks = [request_data(i, client) for i in range(FROM, TO)] + entries = [ + await f for f in tqdm.tqdm(asyncio.as_completed(tasks), total=len(tasks)) + ] + filtered_entries = list(filter(lambda entry: entry.index != 0, entries)) + sorted_entries = list(sorted(filtered_entries)) + result = Result(entries=sorted_entries) await client.aclose() with open("results.json", "w") as file: file.write(result.json()) + generate_csv(result) if __name__ == "__main__": - with open("results.json", "r") as file: - result = Result(**json.load(file)) - generate_csv(result) - #asyncio.run(extract_results()) + # with open("results.json", "r") as file: + # result = Result(**json.load(file)) + if platform.system() == "Windows": + asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) + asyncio.run(extract_results())