Filter out none results and start cleaning up

This commit is contained in:
Johannes Rothe 2022-01-15 13:43:58 +01:00
parent e485912b46
commit 8f4fc97da0
2 changed files with 35 additions and 23 deletions

View File

@ -1,2 +1,4 @@
# scraperoog
`sudo apt install python3-pydantic python3-httpx python3-bs4 locales-all`
`sudo locale-gen`

View File

@ -10,10 +10,14 @@ from bs4 import BeautifulSoup
from httpx import AsyncClient from httpx import AsyncClient
from pydantic import BaseModel from pydantic import BaseModel
# Both calendar endpoints share the same path prefix and end in the
# "wohnids=" query parameter (presumably the unit id is appended by the
# request code -- verify against request_data).
_SEARCH_BASE = "https://www.spiekeroog-vermieter.de/suche/"
MAIN_URL = _SEARCH_BASE + "monatskalenderSite.htm?wohnids="
DATA_URL = _SEARCH_BASE + "monatskalender.htm?wohnids="

# Unit ids to probe; consumed as range(FROM, TO), so TO is exclusive.
FROM = 0
TO = 2000

# First CSS class of a calendar day cell -> numeric status stored in the
# results. NOTE(review): the meaning of 0/1/2 is not documented here --
# presumably free / booked / changeover day; confirm.
STATUS_MAPPING = dict(DayF=0, DayB=1, DayFB=2, DayBF=2)
@ -24,9 +28,9 @@ class Availability(BaseModel):
class Entry(BaseModel):
    """One rental unit together with its scraped availability calendar.

    Only ``index`` is required. The remaining fields default to ``None``
    so that a bare placeholder such as ``Entry(index=0)`` can be built for
    ids that yield no data. The defaults are spelled out explicitly
    because newer pydantic releases treat an ``Optional[...]`` annotation
    without a default as a *required* field.
    """

    # Id of the unit; 0 is also used for the "no data" placeholder.
    index: int
    # House name ("Haus").
    haus: Optional[str] = None
    # Name of the unit within the house ("Wohneinheit").
    wohneinheit: Optional[str] = None
    # One item per scraped calendar day; None on placeholder entries.
    availabilities: Optional[List[Availability]] = None
class Result(BaseModel): class Result(BaseModel):
@ -37,18 +41,27 @@ def generate_csv(result: Result) -> None:
def generate_csv(result: Result) -> None:
    """Write ``result`` to ``result.csv`` with one row per entry.

    The leading columns are the ``Entry`` fields except ``availabilities``.
    They are followed by one column per date (``YYYY-MM-DD``) holding that
    day's status value.

    The date columns are collected from *all* entries in first-seen order.
    An entry without a value for some date gets an empty cell; an empty
    result or an entry whose ``availabilities`` is ``None`` is handled
    instead of raising.

    Args:
        result: The scraped entries to export.
    """
    fieldnames = list(Entry.schema()["properties"].keys())
    fieldnames.remove("availabilities")

    # A dict doubles as an insertion-ordered set of date column names.
    date_columns = {}
    rows = []
    for entry in result.entries:
        row_content = {
            "index": entry.index,
            "haus": entry.haus,
            "wohneinheit": entry.wohneinheit,
        }
        # availabilities is Optional on the model; treat None as "no days".
        for avail in entry.availabilities or []:
            column = avail.date.strftime("%Y-%m-%d")
            row_content[column] = avail.status
            date_columns[column] = None
        rows.append(row_content)
    fieldnames.extend(date_columns)

    # newline="" is required by the csv module so that it controls line
    # endings itself (otherwise blank lines appear on some platforms).
    with open("result.csv", "w", newline="") as csvfile:
        csvwriter = csv.DictWriter(csvfile, fieldnames=fieldnames)
        csvwriter.writeheader()
        csvwriter.writerows(rows)
# German month names as they appear in the calendar header, lower-cased.
_GERMAN_MONTHS = {
    "januar": 1,
    "februar": 2,
    "märz": 3,
    "april": 4,
    "mai": 5,
    "juni": 6,
    "juli": 7,
    "august": 8,
    "september": 9,
    "oktober": 10,
    "november": 11,
    "dezember": 12,
}


def convert_to_datestring(day: str, month: str, year: str) -> datetime:
    """Build a ``datetime`` from a day number, German month name and year.

    Despite its name the function returns a ``datetime`` (midnight of the
    given day), not a string.

    The month name is resolved through a fixed table instead of switching
    the process-wide locale to German on every call. The result therefore
    does not depend on which locales are installed, and no global state is
    modified.

    Args:
        day: Day of the month as digits, e.g. ``"5"`` or ``"05"``.
        month: Full German month name, matched case-insensitively,
            e.g. ``"Januar"`` or ``"März"``.
        year: Year as digits, e.g. ``"2022"``.

    Returns:
        The corresponding ``datetime`` with the time set to 00:00.

    Raises:
        ValueError: If the month name is unknown, ``day``/``year`` are not
            numeric, or the combination is not a valid calendar date.
    """
    try:
        month_number = _GERMAN_MONTHS[month.strip().lower()]
    except KeyError:
        raise ValueError(f"unknown German month name: {month!r}") from None
    return datetime(int(year), month_number, int(day))
@ -71,7 +84,9 @@ async def request_data(index: int, client: AsyncClient) -> Optional[Entry]:
date_raw = elm.parent.parent.parent.thead.tr.td.get_text(strip=True) date_raw = elm.parent.parent.parent.thead.tr.td.get_text(strip=True)
status = STATUS_MAPPING[elm["class"][0]] status = STATUS_MAPPING[elm["class"][0]]
date = convert_to_datestring( date = convert_to_datestring(
elm.get_text(strip=True), date_raw.split(" ")[0], date_raw.split(" ")[1] elm.get_text(strip=True),
date_raw.split(" ")[0],
date_raw.split(" ")[1],
) )
availabilities.append(Availability(date=date, status=status)) availabilities.append(Availability(date=date, status=status))
return Entry( return Entry(
@ -81,22 +96,18 @@ async def request_data(index: int, client: AsyncClient) -> Optional[Entry]:
availabilities=availabilities, availabilities=availabilities,
) )
else: else:
return None return Entry(index=0)
async def extract_results() -> None:
    """Scrape every unit id in ``range(FROM, TO)`` and save ``results.json``.

    All ids are requested concurrently through one shared HTTP client.
    Ids that produced no data are dropped before the remaining entries are
    wrapped in a ``Result`` and written to ``results.json`` as JSON.
    """
    # The context manager closes the client even if one of the requests
    # raises, which a trailing ``await client.aclose()`` would not.
    async with AsyncClient() as client:
        entries = await asyncio.gather(
            *[request_data(i, client) for i in range(FROM, TO)]
        )

    # request_data reports "no data" with a bare placeholder Entry(index=0)
    # (and is annotated as possibly returning None). Index 0 alone is not
    # enough to identify the placeholder because the scanned range starts
    # at 0, so an entry that carries availabilities is always kept.
    entries = [
        entry
        for entry in entries
        if entry is not None
        and (entry.index != 0 or entry.availabilities is not None)
    ]
    result = Result(entries=entries)

    with open("results.json", "w") as file:
        file.write(result.json())
@ -104,6 +115,5 @@ async def extract_results() -> None:
if __name__ == "__main__":
    # Turn a previously scraped results.json into result.csv.
    with open("results.json", "r") as file:
        payload = json.load(file)
    result = Result(**payload)
    generate_csv(result)
    # Scraping is switched off; to refresh results.json from the website,
    # enable the following line.
    # asyncio.run(extract_results())