Initial scraping of valid IDs and names

Saves a pickled dictionary containing valid IDs and their names.
Johannes Rothe committed 2022-01-14 21:25:40 +01:00
parent e8ddc823a7
commit 12f106602a

find_targets.py (new file, 47 lines)

@@ -0,0 +1,47 @@
import asyncio
import pickle
from typing import Tuple

from bs4 import BeautifulSoup
from httpx import AsyncClient

MAIN_URL = (
    "https://www.spiekeroog-vermieter.de/suche/monatskalenderSite.htm?wohnids="
)
DATA_URL = (
    "https://www.spiekeroog-vermieter.de/suche/monatskalender.htm?wohnids="
)
# Range of apartment IDs to probe
FROM = 0
TO = 2000


async def request_async(index: int, client: AsyncClient) -> Tuple[int, str]:
    """Return (index, name) for a valid apartment ID, (0, "") otherwise."""
    response_data = await client.get(DATA_URL + str(index), timeout=20.0)
    # The calendar page contains "Fehler aufgetreten" (an error occurred)
    # for IDs that do not exist
    if "Fehler aufgetreten" not in response_data.text:
        response_title = await client.get(MAIN_URL + str(index), timeout=20.0)
        soup = BeautifulSoup(response_title.text, "lxml")
        # h1 contains the house name, h2 the apartment
        apartment = soup.body.header.h1.get_text()
        unit = soup.body.header.h2.get_text().replace(u"\xa0", u" ")
        name = f"{apartment} - {unit}"
        return index, name
    else:
        return 0, ""


async def get_valid_ids():
    client = AsyncClient()
    results = dict(
        await asyncio.gather(
            *[request_async(i, client) for i in range(FROM, TO)]
        )
    )
    # All invalid IDs collapse onto the key 0; drop that entry
    valid = dict(filter(lambda item: item[0] != 0, results.items()))
    print(f"Valid ids: {valid}")
    await client.aclose()
    with open("valid_ids", "wb") as file:
        pickle.dump(valid, file)


if __name__ == "__main__":
    # print(pickle.load(open("valid_ids", "rb")))
    asyncio.run(get_valid_ids())
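
For a quick check of the scraped results, the pickled dictionary can be loaded back in a separate session. A minimal sketch; it only assumes the "valid_ids" file written by get_valid_ids() above:

import pickle

# Load the {id: "house - apartment"} mapping written by find_targets.py
with open("valid_ids", "rb") as file:
    valid_ids = pickle.load(file)

for wohnid, name in sorted(valid_ids.items()):
    print(f"{wohnid}: {name}")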
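
One design note: get_valid_ids() launches all 2000 probes at once through asyncio.gather. Should the server start rate-limiting, a semaphore can cap the number of in-flight requests. A hedged sketch; MAX_CONCURRENT and request_throttled are illustrative names, not part of this commit:

import asyncio

# Illustrative cap on concurrent requests; the value 50 is an assumption.
MAX_CONCURRENT = 50

async def request_throttled(index, client, sem: asyncio.Semaphore):
    # Wraps the request_async coroutine defined above so that at most
    # MAX_CONCURRENT requests run at any one time.
    async with sem:
        return await request_async(index, client)

# Usage inside get_valid_ids():
#   sem = asyncio.Semaphore(MAX_CONCURRENT)
#   await asyncio.gather(
#       *[request_throttled(i, client, sem) for i in range(FROM, TO)]
#   )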