Compare commits
No commits in common. "master" and "0.0.2" have entirely different histories.
10
README.md
10
README.md
@ -3,10 +3,6 @@
|
||||
`apt install python3-pydantic python3-httpx python3-bs4 python3-tqdm locales-all && sudo locale-gen`
|
||||
|
||||
# Windows build
|
||||
1. Install wine
|
||||
2. Run `winecfg` and configure Windows 10
|
||||
3. Install [python](https://www.python.org/downloads/windows/)
|
||||
4. Install pyinstaller in wine `wine pip install pyinstaller`
|
||||
5. Create a venv and run `pip install .`
|
||||
6. `wine pyinstaller --paths=venv/lib/python3.10/site-packages/ --onefile scraperoog/scrape.py`
|
||||
7. The resulting `.exe` is found under `dist/`
|
||||
1. install wine
|
||||
2. create venv and `pip install .`
|
||||
3. `wine pyinstaller --paths=venv/lib/python3.10/site-packages/ --collect-submodules=lxml --onefile scraperoog/scrape.py`
|
||||
|
@ -3,4 +3,3 @@ haupt_url = https://www.spiekeroog-vermieter.de/suche/monatskalenderSite.htm?woh
|
||||
data_url = https://www.spiekeroog-vermieter.de/suche/monatskalender.htm?wohnids=
|
||||
von_id = 200
|
||||
bis_id = 300
|
||||
id_fehler = nicht aktiv
|
||||
|
@ -1,9 +1,10 @@
|
||||
[project]
|
||||
name = "scraperoog"
|
||||
version = "0.0.5"
|
||||
version = "0.0.2"
|
||||
dependencies = [
|
||||
"tqdm",
|
||||
"bs4",
|
||||
"lxml",
|
||||
"httpx",
|
||||
"pydantic",
|
||||
]
|
||||
|
@ -1,11 +1,10 @@
|
||||
import asyncio
|
||||
import csv
|
||||
import json
|
||||
import locale
|
||||
locale.setlocale(locale.LC_TIME, "German") # dates on that page are German
|
||||
import pickle
|
||||
import platform
|
||||
if platform.system() == "Windows":
|
||||
locale.setlocale(locale.LC_TIME, "German")
|
||||
else:
|
||||
locale.setlocale(locale.LC_TIME, "de_DE.utf_8")
|
||||
from configparser import ConfigParser
|
||||
from datetime import datetime
|
||||
from typing import List, Optional, Tuple
|
||||
@ -24,7 +23,6 @@ MAIN_URL = config['Allgemein'].get('haupt_url')
|
||||
DATA_URL = config['Allgemein'].get('data_url')
|
||||
FROM = config['Allgemein'].getint('von_id')
|
||||
TO = config['Allgemein'].getint('bis_id')
|
||||
ID_NONEXISTANT = config['Allgemein'].get('id_fehler')
|
||||
DATEFORMAT = "%Y-%m-%d"
|
||||
STATUS_MAPPING = {"DayF": 0, "DayB": 1, "DayFB": 0.5, "DayBF": 0.5}
|
||||
|
||||
@ -83,34 +81,35 @@ def convert_to_datestring(day: str, month: str, year: str) -> datetime:
|
||||
|
||||
async def request_data(index: int, client: AsyncClient) -> Optional[Entry]:
|
||||
response_data = await client.get(DATA_URL + str(index), timeout=20.0)
|
||||
if ID_NONEXISTANT in response_data.text:
|
||||
return Entry(index=0)
|
||||
response_title = await client.get(MAIN_URL + str(index), timeout=20.0)
|
||||
title_soup = BeautifulSoup(response_title.text, "html.parser")
|
||||
apartment = (
|
||||
title_soup.body.header.h2.get_text()
|
||||
.replace("\xa0", " ")
|
||||
.replace("Wohneinheit: ", "")
|
||||
)
|
||||
|
||||
data_soup = BeautifulSoup(response_data.text, "html.parser")
|
||||
valid_element = data_soup.find_all("td", attrs={"data-daynum": True})
|
||||
availabilities = []
|
||||
for elm in valid_element:
|
||||
date_raw = elm.parent.parent.parent.thead.tr.td.get_text(strip=True)
|
||||
status = STATUS_MAPPING[elm["class"][0]]
|
||||
date = convert_to_datestring(
|
||||
elm.get_text(strip=True),
|
||||
date_raw.split(" ")[0],
|
||||
date_raw.split(" ")[1],
|
||||
if "Die Darstellung ist derzeit deaktiviert" not in response_data.text:
|
||||
response_title = await client.get(MAIN_URL + str(index), timeout=20.0)
|
||||
title_soup = BeautifulSoup(response_title.text, "lxml")
|
||||
apartment = (
|
||||
title_soup.body.header.h2.get_text()
|
||||
.replace("\xa0", " ")
|
||||
.replace("Wohneinheit: ", "")
|
||||
)
|
||||
availabilities.append(Availability(date=date, status=status))
|
||||
return Entry(
|
||||
index=index,
|
||||
haus=title_soup.body.header.h1.get_text().encode("utf-8"),
|
||||
wohneinheit=apartment.encode("utf-8"),
|
||||
availabilities=availabilities,
|
||||
)
|
||||
|
||||
data_soup = BeautifulSoup(response_data.text, "lxml")
|
||||
valid_element = data_soup.find_all("td", attrs={"data-daynum": True})
|
||||
availabilities = []
|
||||
for elm in valid_element:
|
||||
date_raw = elm.parent.parent.parent.thead.tr.td.get_text(strip=True)
|
||||
status = STATUS_MAPPING[elm["class"][0]]
|
||||
date = convert_to_datestring(
|
||||
elm.get_text(strip=True),
|
||||
date_raw.split(" ")[0],
|
||||
date_raw.split(" ")[1],
|
||||
)
|
||||
availabilities.append(Availability(date=date, status=status))
|
||||
return Entry(
|
||||
index=index,
|
||||
haus=title_soup.body.header.h1.get_text().encode("utf-8"),
|
||||
wohneinheit=apartment.encode("utf-8"),
|
||||
availabilities=availabilities,
|
||||
)
|
||||
else:
|
||||
return Entry(index=0)
|
||||
|
||||
|
||||
async def extract_results() -> None:
|
||||
|
Loading…
x
Reference in New Issue
Block a user