Scrape the days and availability
This commit is contained in:
parent
12f106602a
commit
31403c2e5d
@ -1,47 +0,0 @@
|
|||||||
import asyncio
|
|
||||||
import pickle
|
|
||||||
from httpx import AsyncClient
|
|
||||||
from typing import Tuple
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
|
|
||||||
# Page whose <header> carries the house (h1) and apartment (h2) headings;
# the unit id is appended to the query string.
MAIN_URL = "https://www.spiekeroog-vermieter.de/suche/monatskalenderSite.htm?wohnids="
# Endpoint probed per unit id; its response contains "Fehler aufgetreten"
# when the id is not valid.
DATA_URL = "https://www.spiekeroog-vermieter.de/suche/monatskalender.htm?wohnids="
# Unit ids to probe, used as range(FROM, TO): FROM inclusive, TO exclusive.
FROM, TO = 0, 2000
|
|
||||||
|
|
||||||
|
|
||||||
async def request_async(index: int, client: AsyncClient) -> Tuple:
    """Probe one unit id and, if it exists, build its display name.

    Args:
        index: Unit id appended to DATA_URL and MAIN_URL.
        client: Shared HTTP client used for both requests.

    Returns:
        ``(index, "<h1 text> - <h2 text>")`` for a valid id, or ``(0, "")``
        when the probe response contains "Fehler aufgetreten".
    """
    probe = await client.get(DATA_URL + str(index), timeout=20.0)
    if "Fehler aufgetreten" in probe.text:
        # Invalid id: return the sentinel the caller filters out.
        return 0, ""

    title_page = await client.get(MAIN_URL + str(index), timeout=20.0)
    header = BeautifulSoup(title_page.text, "lxml").body.header
    # h1 contains the house name, h2 the apartment
    house = header.h1.get_text()
    # Non-breaking spaces in the heading are normalised to plain spaces.
    flat = header.h2.get_text().replace("\xa0", " ")
    return index, f"{house} - {flat}"
|
|
||||||
|
|
||||||
|
|
||||||
async def get_valid_ids():
    """Probe every unit id in ``range(FROM, TO)`` and pickle the valid ones.

    All ids are requested concurrently through one shared client. The
    resulting ``{index: name}`` mapping (without the ``0`` sentinel entry that
    request_async returns for invalid ids) is printed and written to the file
    ``valid_ids`` in the current working directory.
    """
    # The context manager closes the client even when one of the requests
    # raises, which a trailing aclose() call would not.
    async with AsyncClient() as client:
        results = dict(
            await asyncio.gather(
                *[request_async(i, client) for i in range(FROM, TO)]
            )
        )

    # Every invalid id collapsed onto key 0 in the dict above; drop it.
    valid = {index: name for index, name in results.items() if index != 0}
    print(f"Valid ids: {valid}")

    with open("valid_ids", "wb") as file:
        pickle.dump(valid, file)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Script entry point: run one full scan. The result can be inspected
    # afterwards by unpickling the "valid_ids" file.
    asyncio.run(get_valid_ids())
|
|
63
scrape.py
Normal file
63
scrape.py
Normal file
@ -0,0 +1,63 @@
|
|||||||
|
import asyncio
|
||||||
|
import locale
|
||||||
|
import pickle
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from httpx import AsyncClient
|
||||||
|
|
||||||
|
# Page whose <header> carries the house (h1) and unit (h2) headings;
# the unit id is appended to the query string.
MAIN_URL = (
    "https://www.spiekeroog-vermieter.de/suche/monatskalenderSite.htm?wohnids="
)
# Page holding the day cells (td[data-daynum]) that are scraped per unit id.
DATA_URL = (
    "https://www.spiekeroog-vermieter.de/suche/monatskalender.htm?wohnids="
)
# Unit ids to scrape, used as range(FROM, TO): FROM inclusive, TO exclusive.
FROM = 100
TO = 120
# Maps the first CSS class of a day cell to a numeric status code.
# NOTE(review): presumably F = free, B = booked and the mixed classes mark
# changeover days - confirm against the site's markup.
STATUS_MAPPING = {
    "DayF": 0,
    "DayB": 1,
    "DayFB": 2,
    "DayBF": 2,
}
|
||||||
|
|
||||||
|
# German month names, casefolded, in calendar order (index + 1 == month number).
_GERMAN_MONTHS = (
    "januar",
    "februar",
    "m\u00e4rz",
    "april",
    "mai",
    "juni",
    "juli",
    "august",
    "september",
    "oktober",
    "november",
    "dezember",
)


def convert_to_datestring(day: str, month: str, year: str) -> str:
    """Convert a German calendar date into an ISO ``YYYY-MM-DD`` string.

    The month name is resolved through a fixed table instead of
    ``locale.setlocale``. The result therefore does not depend on the
    ``de_DE`` locale being installed on the host, and the process-wide
    locale is left untouched.

    Args:
        day: Day of the month as digits, with or without a leading zero.
        month: Full German month name, matched case-insensitively.
        year: Four-digit year as digits.

    Returns:
        The date formatted as ``YYYY-MM-DD``.

    Raises:
        ValueError: If the month name is unknown, a component is not numeric,
            or the combination is not a real calendar date.
    """
    try:
        month_number = _GERMAN_MONTHS.index(month.strip().casefold()) + 1
    except ValueError:
        raise ValueError(f"unknown German month name: {month!r}") from None
    # datetime() validates the day/month/year combination for us.
    date = datetime(int(year), month_number, int(day))
    return date.strftime("%Y-%m-%d")
|
||||||
|
|
||||||
|
|
||||||
|
async def request_data(index: int, client: AsyncClient) -> Tuple:
    """Fetch the headings and per-day status entries for one unit id.

    Args:
        index: Unit id appended to DATA_URL and MAIN_URL.
        client: Shared HTTP client used for both requests.

    Returns:
        ``(index, apartment, unit, days)`` for a valid id, where ``days`` is
        a list of single-entry dicts ``{"YYYY-MM-DD": status}``; or the
        two-element sentinel ``(0, "")`` when the data page contains
        "Fehler aufgetreten".
    """
    calendar_page = await client.get(DATA_URL + str(index), timeout=20.0)
    if "Fehler aufgetreten" in calendar_page.text:
        # Invalid id: return the sentinel the caller filters out.
        return 0, ""

    title_page = await client.get(MAIN_URL + str(index), timeout=20.0)
    header = BeautifulSoup(title_page.text, "lxml").body.header
    apartment = header.h1.get_text()
    unit_heading = header.h2.get_text()
    # Normalise non-breaking spaces, then strip the "Wohneinheit: " label.
    unit = unit_heading.replace("\xa0", " ").replace("Wohneinheit: ", "")

    calendar = BeautifulSoup(calendar_page.text, "lxml")
    days = []
    # Only cells carrying a data-daynum attribute are treated as days.
    for cell in calendar.find_all("td", attrs={"data-daynum": True}):
        day = cell.get_text(strip=True)
        # Three levels up is presumably the month's table; the first cell of
        # its thead row is read as "<month name> <year>".
        caption = cell.parent.parent.parent.thead.tr.td.get_text(strip=True)
        caption_parts = caption.split(" ")
        month = caption_parts[0]
        year = caption_parts[1]
        # The first CSS class of the cell selects the status code.
        status = STATUS_MAPPING[cell["class"][0]]
        days.append({convert_to_datestring(day, month, year): status})
    return index, apartment, unit, days
|
||||||
|
|
||||||
|
|
||||||
|
async def extract_results() -> None:
    """Scrape every unit id in ``range(FROM, TO)`` and pickle the valid results.

    All ids are requested concurrently through one shared client. Results
    whose first element is ``0`` (the sentinel request_data returns for
    invalid ids) are dropped. The remaining tuples are written as a list to
    the file ``valid_ids`` in the current working directory.
    """
    # The context manager closes the client even when one of the requests
    # raises, which a trailing aclose() call would not.
    async with AsyncClient() as client:
        results = await asyncio.gather(
            *(request_data(i, client) for i in range(FROM, TO))
        )

    # Filter the invalid units
    valid = [item for item in results if item[0] != 0]

    with open("valid_ids", "wb") as file:
        pickle.dump(valid, file)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: run one full scrape. The result can be inspected
    # afterwards by unpickling the "valid_ids" file.
    asyncio.run(extract_results())
|
Loading…
x
Reference in New Issue
Block a user