scraperoog/scrape.py

137 lines
4.3 KiB
Python
Raw Permalink Normal View History

2022-01-14 22:43:23 +01:00
import asyncio
2022-01-15 00:10:00 +01:00
import csv
import json
2022-01-14 22:43:23 +01:00
import locale
import pickle
2022-01-16 22:15:19 +01:00
import platform
from configparser import ConfigParser
2022-01-14 22:43:23 +01:00
from datetime import datetime
2022-01-15 00:10:00 +01:00
from typing import List, Optional, Tuple
2022-01-14 22:43:23 +01:00
2022-01-16 22:15:19 +01:00
if platform.system() == "Windows":
import encodings.idna
import tqdm
2022-01-14 22:43:23 +01:00
from bs4 import BeautifulSoup
from httpx import AsyncClient
2022-01-15 00:10:00 +01:00
from pydantic import BaseModel
2022-01-14 22:43:23 +01:00
2022-01-16 22:15:19 +01:00
# Settings are read from ``config.ini`` in the current working directory.
config = ConfigParser()
config.read("config.ini")

# The "Allgemein" section carries both URL prefixes and the ID range to scan.
_general = config["Allgemein"]
MAIN_URL = _general.get("haupt_url")  # prefix of the title page URL; the ID is appended
DATA_URL = _general.get("data_url")  # prefix of the calendar page URL; the ID is appended
FROM = _general.getint("von_id")  # first ID requested
TO = _general.getint("bis_id")  # end of the ID range (used as the ``range`` stop value)

# strftime pattern used for the per-day column names in the CSV export.
DATEFORMAT = "%Y-%m-%d"

# Maps the first CSS class of a calendar cell to a numeric status.
# NOTE(review): presumably "F" means free and "B" booked, with 0.5 marking a
# changeover day -- confirm against the scraped site.
STATUS_MAPPING = {"DayF": 0, "DayB": 1, "DayFB": 0.5, "DayBF": 0.5}
2022-01-14 22:43:23 +01:00
2022-01-15 00:10:00 +01:00
class Availability(BaseModel):
    """Status of one unit on a single calendar day."""

    # Calendar day the status refers to.
    date: datetime
    # Numeric status; the scraper fills it with a value from STATUS_MAPPING.
    status: float
2022-01-15 00:10:00 +01:00
class Entry(BaseModel):
    """One scraped unit, identified and ordered by ``index``.

    Only ``index`` is required. The remaining fields default to ``None`` so
    that a bare ``Entry(index=0)`` can be built as a placeholder for IDs
    without data. The defaults are spelled out explicitly rather than relying
    on the validation library filling in ``None`` for ``Optional`` fields.

    The rich comparison methods order entries by ``index`` so that a list of
    entries can be passed to ``sorted`` directly. Comparing against anything
    that is not an ``Entry`` returns ``NotImplemented``.
    """

    # ID that was appended to the request URLs.
    index: int
    # Text of the title page's <h1> element.
    haus: Optional[str] = None
    # Text of the title page's <h2> element without the "Wohneinheit: " prefix.
    wohneinheit: Optional[str] = None
    # One item per calendar cell found on the data page.
    availabilities: Optional[List[Availability]] = None

    def __lt__(self, other):
        if not isinstance(other, Entry):
            return NotImplemented
        return self.index < other.index

    def __le__(self, other):
        if not isinstance(other, Entry):
            return NotImplemented
        return self.index <= other.index

    def __gt__(self, other):
        if not isinstance(other, Entry):
            return NotImplemented
        return self.index > other.index

    def __ge__(self, other):
        if not isinstance(other, Entry):
            return NotImplemented
        return self.index >= other.index
2022-01-15 00:10:00 +01:00
class Result(BaseModel):
    """Container for all scraped entries; serialised to JSON and CSV."""

    # Entries in the order in which they should be written out.
    entries: List[Entry]
def generate_csv(result: Result) -> None:
    """Write ``result`` to ``result.csv``: one row per entry, one column per day.

    The leading columns are the ``Entry`` fields except ``availabilities``.
    They are followed by one column per distinct day, named with
    ``DATEFORMAT``, in the order in which the days are first seen while
    walking the entries. A day that an entry has no status for is left blank
    in that entry's row.

    An empty ``result`` produces a file that contains only the header with
    the fixed columns. Entries whose ``availabilities`` is ``None`` are
    written without any day values.

    Args:
        result: Scraped entries to export.
    """
    fieldnames = list(Entry.schema()["properties"].keys())
    fieldnames.remove("availabilities")

    # Collect the day columns from *all* entries, not only the first one, so
    # that an empty result or entries covering different days cannot break the
    # export. A dict keeps first-seen order and drops duplicates.
    day_columns = {}
    for entry in result.entries:
        for avail in entry.availabilities or []:
            day_columns[avail.date.strftime(DATEFORMAT)] = None
    fieldnames.extend(day_columns)

    # newline="" lets the csv module control line endings itself.
    with open("result.csv", "w", newline="") as csvfile:
        csvwriter = csv.DictWriter(csvfile, fieldnames=fieldnames)
        csvwriter.writeheader()
        for entry in result.entries:
            row_content = {
                "index": entry.index,
                "haus": entry.haus,
                "wohneinheit": entry.wohneinheit,
            }
            for avail in entry.availabilities or []:
                row_content[avail.date.strftime(DATEFORMAT)] = avail.status
            csvwriter.writerow(row_content)
def convert_to_datestring(day: str, month: str, year: str) -> datetime:
    """Build a ``datetime`` from a day number, a full month name and a year.

    Despite its name, the function returns a ``datetime`` object, not a
    string. The month name is parsed with ``%B``, i.e. according to the
    ``LC_TIME`` locale, which is switched to the user's default locale on
    every call.

    Args:
        day: Day of the month; zero-padded to two digits before parsing.
        month: Full month name as spelled in the active locale.
        year: Four-digit year.

    Returns:
        The parsed date at midnight.
    """
    # An empty locale string selects the user's default LC_TIME setting.
    locale.setlocale(locale.LC_TIME, "")
    date_text = "{} {} {}".format(day.zfill(2), month, year)
    return datetime.strptime(date_text, "%d %B %Y")
2022-01-14 22:43:23 +01:00
2022-01-15 00:10:00 +01:00
async def request_data(index: int, client: AsyncClient) -> Optional[Entry]:
    """Fetch and parse the calendar page and the title page for one ID.

    The calendar page (``DATA_URL + index``) is requested first. If its text
    contains "Fehler aufgetreten", the ID is treated as having no data and a
    placeholder ``Entry(index=0)`` is returned; callers filter on that index.
    Otherwise the title page (``MAIN_URL + index``) is requested as well and
    both documents are parsed.

    Args:
        index: ID appended to both URL prefixes.
        client: Shared HTTP client used for both requests.

    Returns:
        The parsed entry, or the ``index=0`` placeholder for an error page.

    Raises:
        KeyError: If a calendar cell carries a CSS class that is missing
            from ``STATUS_MAPPING``.
    """
    response_data = await client.get(DATA_URL + str(index), timeout=20.0)
    if "Fehler aufgetreten" in response_data.text:
        return Entry(index=0)

    response_title = await client.get(MAIN_URL + str(index), timeout=20.0)
    title_soup = BeautifulSoup(response_title.text, "lxml")
    apartment = (
        title_soup.body.header.h2.get_text()
        .replace("\xa0", " ")
        .replace("Wohneinheit: ", "")
    )

    data_soup = BeautifulSoup(response_data.text, "lxml")
    availabilities = []
    # Every <td> that carries a data-daynum attribute is one calendar day.
    for elm in data_soup.find_all("td", attrs={"data-daynum": True}):
        # Three levels up from the cell, the first <thead> cell holds the
        # caption; its first two space-separated parts are used as month and
        # year.
        date_raw = elm.parent.parent.parent.thead.tr.td.get_text(strip=True)
        caption_parts = date_raw.split(" ")
        status = STATUS_MAPPING[elm["class"][0]]
        date = convert_to_datestring(
            elm.get_text(strip=True),
            caption_parts[0],
            caption_parts[1],
        )
        availabilities.append(Availability(date=date, status=status))

    # The text fields are passed as str directly. Encoding them to bytes first
    # only worked because the model converts bytes back to str on validation.
    return Entry(
        index=index,
        haus=title_soup.body.header.h1.get_text(),
        wohneinheit=apartment,
        availabilities=availabilities,
    )
2022-01-14 22:43:23 +01:00
async def extract_results() -> None:
    """Scrape all configured IDs and write ``results.json`` and ``result.csv``.

    One ``request_data`` coroutine is created for every ID in
    ``range(FROM, TO)`` (``TO`` itself is not requested). They are awaited as
    they complete, with a progress bar. Placeholder entries (``index == 0``)
    are dropped and the remaining entries are sorted by index before export.

    The HTTP client is managed by ``async with`` so that it is closed even
    when a request or the parsing raises.
    """
    async with AsyncClient() as client:
        tasks = [request_data(i, client) for i in range(FROM, TO)]
        entries = [
            await f for f in tqdm.tqdm(asyncio.as_completed(tasks), total=len(tasks))
        ]

    # index 0 marks IDs for which the site returned its error page.
    sorted_entries = sorted(entry for entry in entries if entry.index != 0)
    result = Result(entries=sorted_entries)

    with open("results.json", "w") as file:
        file.write(result.json())
    generate_csv(result)
2022-01-14 22:43:23 +01:00
if __name__ == "__main__":
    # Left-over snippet, apparently for loading an earlier run from disk
    # instead of scraping again:
    # with open("results.json", "r") as file:
    #     result = Result(**json.load(file))

    # On Windows the selector event loop policy is installed before the
    # scrape is started.
    running_on_windows = platform.system() == "Windows"
    if running_on_windows:
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

    asyncio.run(extract_results())