Initial scraping of valid IDs and names
Saves a pickled dictionary containing valid IDs and their names.
parent e8ddc823a7
commit 12f106602a

find_targets.py (new file, 47 lines)
@@ -0,0 +1,47 @@
import asyncio
import pickle
from typing import Tuple

from bs4 import BeautifulSoup
from httpx import AsyncClient

MAIN_URL = (
    "https://www.spiekeroog-vermieter.de/suche/monatskalenderSite.htm?wohnids="
)
DATA_URL = (
    "https://www.spiekeroog-vermieter.de/suche/monatskalender.htm?wohnids="
)
FROM = 0
TO = 2000


async def request_async(index: int, client: AsyncClient) -> Tuple[int, str]:
    response_data = await client.get(DATA_URL + str(index), timeout=20.0)
    # Invalid IDs are answered with an error page ("Fehler aufgetreten").
    if "Fehler aufgetreten" not in response_data.text:
        response_title = await client.get(MAIN_URL + str(index), timeout=20.0)
        soup = BeautifulSoup(response_title.text, "lxml")
        # h1 contains the house name, h2 the apartment
        house = soup.body.header.h1.get_text()
        apartment = soup.body.header.h2.get_text().replace("\xa0", " ")
        name = f"{house} - {apartment}"
        return index, name
    else:
        return 0, ""


async def get_valid_ids():
    client = AsyncClient()
    results = dict(
        await asyncio.gather(
            *[request_async(i, client) for i in range(FROM, TO)]
        )
    )
    # Drop the sentinel entry (key 0) produced by invalid IDs.
    valid = dict(filter(lambda item: item[0] != 0, results.items()))
    print(f"Valid ids: {valid}")
    await client.aclose()
    with open("valid_ids", "wb") as file:
        pickle.dump(valid, file)


if __name__ == "__main__":
    # print(pickle.load(open("valid_ids", "rb")))
    asyncio.run(get_valid_ids())
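
The script fires off all requests in the range at once via asyncio.gather and lets the client's connection pool throttle them. Once it has run, the pickled dictionary can be read back for inspection. A minimal sketch, assuming the "valid_ids" file sits in the working directory (the loop variable names here are illustrative):

import pickle

# Load the ID -> name mapping written by find_targets.py.
with open("valid_ids", "rb") as file:
    valid_ids = pickle.load(file)

for wohnid, name in valid_ids.items():
    print(f"{wohnid}: {name}")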