Initial scraping of valid IDs and names
Saves a pickled dictionary containing valid IDs and their names.
This commit is contained in:
parent
e8ddc823a7
commit
12f106602a
47
find_targets.py
Normal file
47
find_targets.py
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
import asyncio
|
||||||
|
import pickle
|
||||||
|
from httpx import AsyncClient
|
||||||
|
from typing import Tuple
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
# Page whose <header> carries the house (h1) and apartment (h2) names;
# the numeric ID is appended to the query string.
MAIN_URL = "https://www.spiekeroog-vermieter.de/suche/monatskalenderSite.htm?wohnids="

# Endpoint probed first: its response contains "Fehler aufgetreten" when the
# appended ID does not exist.
DATA_URL = "https://www.spiekeroog-vermieter.de/suche/monatskalender.htm?wohnids="

# Half-open range of IDs to probe, i.e. range(FROM, TO).
FROM = 0
TO = 2000
|
||||||
|
|
||||||
|
|
||||||
|
async def request_async(index: int, client: AsyncClient) -> Tuple[int, str]:
    """Probe a single ID and, if it exists, look up its display name.

    The data endpoint is queried first; a response containing
    "Fehler aufgetreten" marks the ID as invalid. For valid IDs the main
    page is fetched and the name is assembled from the header's h1 (house)
    and h2 (apartment).

    Args:
        index: Numeric ID appended to DATA_URL / MAIN_URL.
        client: Shared httpx client used for both requests.

    Returns:
        ``(index, "<house> - <apartment>")`` for a valid ID, otherwise the
        sentinel ``(0, "")``. The name of a valid ID is never empty, so
        callers should test the name rather than the index to tell the two
        apart (0 may itself be a probed ID).
    """
    # Local import: the module only pulls AsyncClient from httpx at file level.
    from httpx import HTTPError

    invalid = (0, "")

    try:
        response_data = await client.get(DATA_URL + str(index), timeout=20.0)
        if "Fehler aufgetreten" in response_data.text:
            return invalid
        response_title = await client.get(MAIN_URL + str(index), timeout=20.0)
    except HTTPError as err:
        # One timed-out or failed request must not abort the whole scan;
        # report it and treat the ID as not found.
        print(f"Request for id {index} failed: {err!r}")
        return invalid

    soup = BeautifulSoup(response_title.text, "lxml")
    # h1 contains the house name, h2 the apartment. Any of these elements may
    # be missing on an unexpected page, so guard every step of the lookup.
    header = soup.body.header if soup.body is not None else None
    house_tag = header.h1 if header is not None else None
    apartment_tag = header.h2 if header is not None else None
    if house_tag is None or apartment_tag is None:
        return invalid

    house = house_tag.get_text()
    # Replace non-breaking spaces with plain spaces in the apartment name.
    apartment = apartment_tag.get_text().replace("\xa0", " ")
    return index, f"{house} - {apartment}"
|
||||||
|
|
||||||
|
|
||||||
|
async def get_valid_ids(max_concurrency: int = 50):
    """Probe every ID in ``range(FROM, TO)`` and pickle the valid ones.

    Each ID is checked with ``request_async``. The resulting mapping of
    valid ID -> name is printed and written with ``pickle`` to the file
    ``valid_ids`` in the current working directory.

    Args:
        max_concurrency: Upper bound on the number of IDs probed at the
            same time. Must be at least 1.

    Raises:
        ValueError: If ``max_concurrency`` is smaller than 1.
    """
    if max_concurrency < 1:
        raise ValueError("max_concurrency must be at least 1")

    # Bound the number of in-flight probes instead of starting every request
    # at once, which risks timeouts while requests wait for a connection.
    semaphore = asyncio.Semaphore(max_concurrency)

    async def probe(index: int, client: AsyncClient):
        async with semaphore:
            return await request_async(index, client)

    # The context manager closes the client even if a probe raises.
    async with AsyncClient() as client:
        pairs = await asyncio.gather(
            *(probe(i, client) for i in range(FROM, TO))
        )

    # Invalid probes come back as (0, ""). Filter on the empty name *before*
    # building the dict so that a genuinely valid ID 0 is neither overwritten
    # by the sentinel nor dropped.
    valid = {index: name for index, name in pairs if name}
    print(f"Valid ids: {valid}")

    with open("valid_ids", "wb") as file:
        pickle.dump(valid, file)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # To inspect the result of an earlier run instead of scraping again:
    #     print(pickle.load(open("valid_ids", "rb")))
    asyncio.run(get_valid_ids())
|
Loading…
x
Reference in New Issue
Block a user