Add config, make Windows compatible
This commit is contained in:
parent
11dd02d3c7
commit
50c344c583
5
config.ini
Normal file
5
config.ini
Normal file
@ -0,0 +1,5 @@
|
||||
[Allgemein]
|
||||
haupt_url = https://www.spiekeroog-vermieter.de/suche/monatskalenderSite.htm?wohnids=
|
||||
data_url = https://www.spiekeroog-vermieter.de/suche/monatskalender.htm?wohnids=
|
||||
von_id = 200
|
||||
bis_id = 300
|
69
scrape.py
69
scrape.py
@ -3,21 +3,26 @@ import csv
|
||||
import json
|
||||
import locale
|
||||
import pickle
|
||||
import platform
|
||||
from configparser import ConfigParser
|
||||
from datetime import datetime
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
if platform.system() == "Windows":
|
||||
import encodings.idna
|
||||
import tqdm
|
||||
from bs4 import BeautifulSoup
|
||||
from httpx import AsyncClient
|
||||
from pydantic import BaseModel
|
||||
|
||||
# Built-in defaults, used whenever config.ini (or one of its keys) is absent.
DEFAULT_MAIN_URL = (
    "https://www.spiekeroog-vermieter.de/suche/monatskalenderSite.htm?wohnids="
)
DEFAULT_DATA_URL = (
    "https://www.spiekeroog-vermieter.de/suche/monatskalender.htm?wohnids="
)
DEFAULT_FROM = 0
DEFAULT_TO = 2000

# ConfigParser.read() silently skips files it cannot open, so a missing
# config.ini leaves the parser empty and every lookup below uses its fallback
# instead of failing with a KeyError on the [Allgemein] section.
config = ConfigParser()
config.read("config.ini")

MAIN_URL = config.get("Allgemein", "haupt_url", fallback=DEFAULT_MAIN_URL)
DATA_URL = config.get("Allgemein", "data_url", fallback=DEFAULT_DATA_URL)
# FROM/TO bound the ids requested by extract_results via range(FROM, TO),
# so TO itself is excluded.
FROM = config.getint("Allgemein", "von_id", fallback=DEFAULT_FROM)
TO = config.getint("Allgemein", "bis_id", fallback=DEFAULT_TO)
# Format of the per-day column names written to the CSV.
DATEFORMAT = "%Y-%m-%d"
# NOTE(review): keys look like calendar-cell class names and values like an
# occupancy fraction (0 free, 1 booked, 0.5 half day) - confirm against
# request_data.
STATUS_MAPPING = {"DayF": 0, "DayB": 1, "DayFB": 0.5, "DayBF": 0.5}
|
||||
|
||||
|
||||
@ -31,6 +36,17 @@ class Entry(BaseModel):
|
||||
haus: Optional[str]
|
||||
wohneinheit: Optional[str]
|
||||
availabilities: Optional[List[Availability]]
|
||||
def __lt__(self, other):
    """Return whether this entry's ``index`` is less than *other*'s.

    Returns ``NotImplemented`` when *other* has no ``index`` attribute, so
    Python raises its usual ``TypeError`` for unsupported operands instead
    of an ``AttributeError`` escaping from here.
    """
    if not hasattr(other, "index"):
        return NotImplemented
    return self.index < other.index
|
||||
|
||||
def __le__(self, other):
    """Return whether this entry's ``index`` is less than or equal to *other*'s.

    Returns ``NotImplemented`` when *other* has no ``index`` attribute, so
    Python raises its usual ``TypeError`` for unsupported operands instead
    of an ``AttributeError`` escaping from here.
    """
    if not hasattr(other, "index"):
        return NotImplemented
    return self.index <= other.index
|
||||
|
||||
def __gt__(self, other):
    """Return whether this entry's ``index`` is greater than *other*'s.

    Returns ``NotImplemented`` when *other* has no ``index`` attribute, so
    Python raises its usual ``TypeError`` for unsupported operands instead
    of an ``AttributeError`` escaping from here.
    """
    if not hasattr(other, "index"):
        return NotImplemented
    return self.index > other.index
|
||||
|
||||
def __ge__(self, other):
    """Return whether this entry's ``index`` is greater than or equal to *other*'s.

    Returns ``NotImplemented`` when *other* has no ``index`` attribute, so
    Python raises its usual ``TypeError`` for unsupported operands instead
    of an ``AttributeError`` escaping from here.
    """
    if not hasattr(other, "index"):
        return NotImplemented
    return self.index >= other.index
|
||||
|
||||
|
||||
class Result(BaseModel):
|
||||
@ -38,14 +54,11 @@ class Result(BaseModel):
|
||||
|
||||
|
||||
def generate_csv(result: Result) -> None:
|
||||
with open("result.csv", "w") as csvfile:
|
||||
with open("result.csv", "w", newline='') as csvfile:
|
||||
fieldnames = list(Entry.schema()["properties"].keys())
|
||||
fieldnames.remove("availabilities")
|
||||
fieldnames.extend(
|
||||
[
|
||||
a.date.strftime("%Y-%m-%d")
|
||||
for a in result.entries[0].availabilities
|
||||
]
|
||||
[a.date.strftime(DATEFORMAT) for a in result.entries[0].availabilities]
|
||||
)
|
||||
csvwriter = csv.DictWriter(csvfile, fieldnames=fieldnames)
|
||||
csvwriter.writeheader()
|
||||
@ -56,12 +69,12 @@ def generate_csv(result: Result) -> None:
|
||||
"wohneinheit": entry.wohneinheit,
|
||||
}
|
||||
for avail in entry.availabilities:
|
||||
row_content[avail.date.strftime("%Y-%m-%d")] = avail.status
|
||||
row_content[avail.date.strftime(DATEFORMAT)] = avail.status
|
||||
csvwriter.writerow(row_content)
|
||||
|
||||
|
||||
# Full German month names mapped to their month number, keyed in casefolded
# form so the lookup is case-insensitive like strptime's %B.
_GERMAN_MONTHS = {
    name.casefold(): number
    for number, name in enumerate(
        (
            "Januar",
            "Februar",
            "März",
            "April",
            "Mai",
            "Juni",
            "Juli",
            "August",
            "September",
            "Oktober",
            "November",
            "Dezember",
        ),
        start=1,
    )
}


def convert_to_datestring(day: str, month: str, year: str) -> datetime:
    """Build a ``datetime`` from a day, a full month name and a year.

    German month names are resolved through an explicit table, so the result
    does not depend on which locales the operating system provides
    ("de_DE.utf8" is not available on Windows, and the system default locale
    is not necessarily German).

    Args:
        day: Day of the month, with or without a leading zero.
        month: Full month name, e.g. "März".
        year: Four-digit year.

    Returns:
        The corresponding ``datetime`` at midnight.

    Raises:
        ValueError: If the parts do not form a valid date.
    """
    month_number = _GERMAN_MONTHS.get(month.strip().casefold())
    if month_number is not None:
        return datetime(int(year), month_number, int(day))

    # Unknown month name: keep the previous locale-dependent parsing so that
    # names in the system's own language are still accepted.
    locale.setlocale(locale.LC_TIME, "")
    return datetime.strptime(f"{day.zfill(2)} {month} {year}", "%d %B %Y")
|
||||
|
||||
@ -91,8 +104,8 @@ async def request_data(index: int, client: AsyncClient) -> Optional[Entry]:
|
||||
availabilities.append(Availability(date=date, status=status))
|
||||
return Entry(
|
||||
index=index,
|
||||
haus=title_soup.body.header.h1.get_text(),
|
||||
wohneinheit=apartment,
|
||||
haus=title_soup.body.header.h1.get_text().encode("utf-8"),
|
||||
wohneinheit=apartment.encode("utf-8"),
|
||||
availabilities=availabilities,
|
||||
)
|
||||
else:
|
||||
@ -101,19 +114,23 @@ async def request_data(index: int, client: AsyncClient) -> Optional[Entry]:
|
||||
|
||||
async def extract_results() -> None:
    """Fetch every id in ``range(FROM, TO)`` and write the results to disk.

    All requests are started concurrently with a tqdm progress bar. The
    surviving entries are sorted by ``index``, dumped to ``results.json`` and
    passed to ``generate_csv``.
    """
    # ``async with`` closes the client even when one of the requests raises.
    async with AsyncClient() as client:
        tasks = [request_data(i, client) for i in range(FROM, TO)]
        entries = [
            await finished
            for finished in tqdm.tqdm(asyncio.as_completed(tasks), total=len(tasks))
        ]

    # request_data is annotated Optional[Entry], so guard against None before
    # reading ``index``.
    # NOTE(review): index 0 presumably marks ids without a listing - confirm.
    valid_entries = [
        entry for entry in entries if entry is not None and entry.index != 0
    ]
    # as_completed yields in completion order, so restore ascending id order.
    valid_entries.sort(key=lambda entry: entry.index)
    result = Result(entries=valid_entries)

    with open("results.json", "w") as file:
        file.write(result.json())
    generate_csv(result)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Windows only: switch asyncio to the selector event loop before running.
    # NOTE(review): presumably works around issues with the default proactor
    # loop on Windows - confirm this is still needed.
    if platform.system() == "Windows":
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    asyncio.run(extract_results())
|
||||
|
Loading…
x
Reference in New Issue
Block a user