diff --git a/config.ini b/config.ini
index e2a9363..1227b9a 100644
--- a/config.ini
+++ b/config.ini
@@ -3,3 +3,4 @@ haupt_url = https://www.spiekeroog-vermieter.de/suche/monatskalenderSite.htm?wohnids=
 data_url = https://www.spiekeroog-vermieter.de/suche/monatskalender.htm?wohnids=
 von_id = 200
 bis_id = 300
+id_fehler = nicht aktiv
diff --git a/scraperoog/scrape.py b/scraperoog/scrape.py
index a9989f9..70a098b 100644
--- a/scraperoog/scrape.py
+++ b/scraperoog/scrape.py
@@ -24,6 +24,7 @@ MAIN_URL = config['Allgemein'].get('haupt_url')
 DATA_URL = config['Allgemein'].get('data_url')
 FROM = config['Allgemein'].getint('von_id')
 TO = config['Allgemein'].getint('bis_id')
+ID_NONEXISTANT = config['Allgemein'].get('id_fehler')
 DATEFORMAT = "%Y-%m-%d"
 STATUS_MAPPING = {"DayF": 0, "DayB": 1, "DayFB": 0.5, "DayBF": 0.5}
 
@@ -82,35 +83,34 @@ def convert_to_datestring(day: str, month: str, year: str) -> datetime:
 
 async def request_data(index: int, client: AsyncClient) -> Optional[Entry]:
     response_data = await client.get(DATA_URL + str(index), timeout=20.0)
-    if "Dieser Belegungskalender ist derzeit nicht aktiv." not in response_data.text:
-        response_title = await client.get(MAIN_URL + str(index), timeout=20.0)
-        title_soup = BeautifulSoup(response_title.text, "html.parser")
-        apartment = (
-            title_soup.body.header.h2.get_text()
-            .replace("\xa0", " ")
-            .replace("Wohneinheit: ", "")
-        )
-
-        data_soup = BeautifulSoup(response_data.text, "html.parser")
-        valid_element = data_soup.find_all("td", attrs={"data-daynum": True})
-        availabilities = []
-        for elm in valid_element:
-            date_raw = elm.parent.parent.parent.thead.tr.td.get_text(strip=True)
-            status = STATUS_MAPPING[elm["class"][0]]
-            date = convert_to_datestring(
-                elm.get_text(strip=True),
-                date_raw.split(" ")[0],
-                date_raw.split(" ")[1],
-            )
-            availabilities.append(Availability(date=date, status=status))
-        return Entry(
-            index=index,
-            haus=title_soup.body.header.h1.get_text().encode("utf-8"),
-            wohneinheit=apartment.encode("utf-8"),
-            availabilities=availabilities,
-        )
-    else:
+    if ID_NONEXISTANT in response_data.text:
         return Entry(index=0)
+    response_title = await client.get(MAIN_URL + str(index), timeout=20.0)
+    title_soup = BeautifulSoup(response_title.text, "html.parser")
+    apartment = (
+        title_soup.body.header.h2.get_text()
+        .replace("\xa0", " ")
+        .replace("Wohneinheit: ", "")
+    )
+
+    data_soup = BeautifulSoup(response_data.text, "html.parser")
+    valid_element = data_soup.find_all("td", attrs={"data-daynum": True})
+    availabilities = []
+    for elm in valid_element:
+        date_raw = elm.parent.parent.parent.thead.tr.td.get_text(strip=True)
+        status = STATUS_MAPPING[elm["class"][0]]
+        date = convert_to_datestring(
+            elm.get_text(strip=True),
+            date_raw.split(" ")[0],
+            date_raw.split(" ")[1],
+        )
+        availabilities.append(Availability(date=date, status=status))
+    return Entry(
+        index=index,
+        haus=title_soup.body.header.h1.get_text().encode("utf-8"),
+        wohneinheit=apartment.encode("utf-8"),
+        availabilities=availabilities,
+    )
 
 
 async def extract_results() -> None: