Compare commits

...

6 Commits

5 changed files with 64 additions and 37 deletions

View File

@ -1,4 +1,12 @@
# scraperoog # Requirements
`apt install python3-pydantic python3-httpx python3-bs4 locales-all` `apt install python3-pydantic python3-httpx python3-bs4 python3-tqdm locales-all && sudo locale-gen`
`sudo locale-gen`
# Windows build
1. Install wine
2. Run `winecfg` and set the Windows version to Windows 10
3. Install [python](https://www.python.org/downloads/windows/)
4. Install PyInstaller inside Wine: `wine pip install pyinstaller`
5. Create a virtual environment named `venv` and run `pip install .` in it
6. `wine pyinstaller --paths=venv/lib/python3.10/site-packages/ --onefile scraperoog/scrape.py`
7. The resulting `.exe` is found under `dist/`

View File

@ -3,3 +3,4 @@ haupt_url = https://www.spiekeroog-vermieter.de/suche/monatskalenderSite.htm?woh
data_url = https://www.spiekeroog-vermieter.de/suche/monatskalender.htm?wohnids= data_url = https://www.spiekeroog-vermieter.de/suche/monatskalender.htm?wohnids=
von_id = 200 von_id = 200
bis_id = 300 bis_id = 300
id_fehler = nicht aktiv

15
pyproject.toml Normal file
View File

@ -0,0 +1,15 @@
[project]
name = "scraperoog"
version = "0.0.5"
dependencies = [
"tqdm",
"bs4",
"httpx",
"pydantic",
]
[tool.setuptools.packages]
find = {}
[project.scripts]
scraperoog = "scraperoog.scrape:main"

0
scraperoog/__init__.py Normal file
View File

View File

@ -1,9 +1,11 @@
import asyncio import asyncio
import csv import csv
import json
import locale import locale
import pickle
import platform import platform
if platform.system() == "Windows":
locale.setlocale(locale.LC_TIME, "German")
else:
locale.setlocale(locale.LC_TIME, "de_DE.utf_8")
from configparser import ConfigParser from configparser import ConfigParser
from datetime import datetime from datetime import datetime
from typing import List, Optional, Tuple from typing import List, Optional, Tuple
@ -22,6 +24,7 @@ MAIN_URL = config['Allgemein'].get('haupt_url')
DATA_URL = config['Allgemein'].get('data_url') DATA_URL = config['Allgemein'].get('data_url')
FROM = config['Allgemein'].getint('von_id') FROM = config['Allgemein'].getint('von_id')
TO = config['Allgemein'].getint('bis_id') TO = config['Allgemein'].getint('bis_id')
ID_NONEXISTANT = config['Allgemein'].get('id_fehler')
DATEFORMAT = "%Y-%m-%d" DATEFORMAT = "%Y-%m-%d"
STATUS_MAPPING = {"DayF": 0, "DayB": 1, "DayFB": 0.5, "DayBF": 0.5} STATUS_MAPPING = {"DayF": 0, "DayB": 1, "DayFB": 0.5, "DayBF": 0.5}
@ -74,23 +77,23 @@ def generate_csv(result: Result) -> None:
def convert_to_datestring(day: str, month: str, year: str) -> datetime: def convert_to_datestring(day: str, month: str, year: str) -> datetime:
locale.setlocale(locale.LC_TIME, "")
date = datetime.strptime(f"{day.zfill(2)} {month} {year}", "%d %B %Y") date = datetime.strptime(f"{day.zfill(2)} {month} {year}", "%d %B %Y")
return date return date
async def request_data(index: int, client: AsyncClient) -> Optional[Entry]: async def request_data(index: int, client: AsyncClient) -> Optional[Entry]:
response_data = await client.get(DATA_URL + str(index), timeout=20.0) response_data = await client.get(DATA_URL + str(index), timeout=20.0)
if "Fehler aufgetreten" not in response_data.text: if ID_NONEXISTANT in response_data.text:
return Entry(index=0)
response_title = await client.get(MAIN_URL + str(index), timeout=20.0) response_title = await client.get(MAIN_URL + str(index), timeout=20.0)
title_soup = BeautifulSoup(response_title.text, "lxml") title_soup = BeautifulSoup(response_title.text, "html.parser")
apartment = ( apartment = (
title_soup.body.header.h2.get_text() title_soup.body.header.h2.get_text()
.replace("\xa0", " ") .replace("\xa0", " ")
.replace("Wohneinheit: ", "") .replace("Wohneinheit: ", "")
) )
data_soup = BeautifulSoup(response_data.text, "lxml") data_soup = BeautifulSoup(response_data.text, "html.parser")
valid_element = data_soup.find_all("td", attrs={"data-daynum": True}) valid_element = data_soup.find_all("td", attrs={"data-daynum": True})
availabilities = [] availabilities = []
for elm in valid_element: for elm in valid_element:
@ -108,8 +111,6 @@ async def request_data(index: int, client: AsyncClient) -> Optional[Entry]:
wohneinheit=apartment.encode("utf-8"), wohneinheit=apartment.encode("utf-8"),
availabilities=availabilities, availabilities=availabilities,
) )
else:
return Entry(index=0)
async def extract_results() -> None: async def extract_results() -> None:
@ -127,10 +128,12 @@ async def extract_results() -> None:
file.write(result.json()) file.write(result.json())
generate_csv(result) generate_csv(result)
def main() -> None:
    """Run the scraper's async pipeline to completion.

    On Windows the selector event loop policy is installed before the
    loop is started; on every other platform asyncio's default policy
    is left untouched.
    """
    running_on_windows = platform.system() == "Windows"
    if running_on_windows:
        # The policy class is only looked up on Windows, inside this branch.
        selector_policy = asyncio.WindowsSelectorEventLoopPolicy()
        asyncio.set_event_loop_policy(selector_policy)
    asyncio.run(extract_results())
if __name__ == "__main__": if __name__ == "__main__":
# with open("results.json", "r") as file: # with open("results.json", "r") as file:
# result = Result(**json.load(file)) # result = Result(**json.load(file))
if platform.system() == "Windows": main()
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
asyncio.run(extract_results())