Compare commits
6 Commits
Author | SHA1 | Date | |
---|---|---|---|
1bcbc89da5 | |||
4e99a7f87b | |||
d7b07d02b5 | |||
8c6b672d90 | |||
e6be0ab494 | |||
c6d39ee579 |
14
README.md
14
README.md
@ -1,4 +1,12 @@
|
|||||||
# scraperoog
|
# Requirements
|
||||||
|
|
||||||
`apt install python3-pydantic python3-httpx python3-bs4 locales-all`
|
`apt install python3-pydantic python3-httpx python3-bs4 python3-tqdm locales-all && sudo locale-gen`
|
||||||
`sudo locale-gen`
|
|
||||||
|
# Windows build
|
||||||
|
1. Install wine
|
||||||
|
2. Run `winecfg` and set the Windows version to Windows 10
|
||||||
|
3. Install [python](https://www.python.org/downloads/windows/)
|
||||||
|
4. Install pyinstaller in wine: `wine pip install pyinstaller`
|
||||||
|
5. Create a venv and run `pip install .`
|
||||||
|
6. `wine pyinstaller --paths=venv/lib/python3.10/site-packages/ --onefile scraperoog/scrape.py`
|
||||||
|
7. The resulting `.exe` is found under `dist/`
|
||||||
|
@ -3,3 +3,4 @@ haupt_url = https://www.spiekeroog-vermieter.de/suche/monatskalenderSite.htm?woh
|
|||||||
data_url = https://www.spiekeroog-vermieter.de/suche/monatskalender.htm?wohnids=
|
data_url = https://www.spiekeroog-vermieter.de/suche/monatskalender.htm?wohnids=
|
||||||
von_id = 200
|
von_id = 200
|
||||||
bis_id = 300
|
bis_id = 300
|
||||||
|
id_fehler = nicht aktiv
|
||||||
|
15
pyproject.toml
Normal file
15
pyproject.toml
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
[project]
|
||||||
|
name = "scraperoog"
|
||||||
|
version = "0.0.5"
|
||||||
|
dependencies = [
|
||||||
|
"tqdm",
|
||||||
|
"bs4",
|
||||||
|
"httpx",
|
||||||
|
"pydantic",
|
||||||
|
]
|
||||||
|
|
||||||
|
[tool.setuptools.packages]
|
||||||
|
find = {}
|
||||||
|
|
||||||
|
[project.scripts]
|
||||||
|
scraperoog = "scraperoog.scrape:main"
|
0
scraperoog/__init__.py
Normal file
0
scraperoog/__init__.py
Normal file
@ -1,9 +1,11 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
import csv
|
import csv
|
||||||
import json
|
|
||||||
import locale
|
import locale
|
||||||
import pickle
|
|
||||||
import platform
|
import platform
|
||||||
|
if platform.system() == "Windows":
|
||||||
|
locale.setlocale(locale.LC_TIME, "German")
|
||||||
|
else:
|
||||||
|
locale.setlocale(locale.LC_TIME, "de_DE.utf_8")
|
||||||
from configparser import ConfigParser
|
from configparser import ConfigParser
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import List, Optional, Tuple
|
from typing import List, Optional, Tuple
|
||||||
@ -22,6 +24,7 @@ MAIN_URL = config['Allgemein'].get('haupt_url')
|
|||||||
DATA_URL = config['Allgemein'].get('data_url')
|
DATA_URL = config['Allgemein'].get('data_url')
|
||||||
FROM = config['Allgemein'].getint('von_id')
|
FROM = config['Allgemein'].getint('von_id')
|
||||||
TO = config['Allgemein'].getint('bis_id')
|
TO = config['Allgemein'].getint('bis_id')
|
||||||
|
ID_NONEXISTANT = config['Allgemein'].get('id_fehler')
|
||||||
DATEFORMAT = "%Y-%m-%d"
|
DATEFORMAT = "%Y-%m-%d"
|
||||||
STATUS_MAPPING = {"DayF": 0, "DayB": 1, "DayFB": 0.5, "DayBF": 0.5}
|
STATUS_MAPPING = {"DayF": 0, "DayB": 1, "DayFB": 0.5, "DayBF": 0.5}
|
||||||
|
|
||||||
@ -74,42 +77,40 @@ def generate_csv(result: Result) -> None:
|
|||||||
|
|
||||||
|
|
||||||
def convert_to_datestring(day: str, month: str, year: str) -> datetime:
|
def convert_to_datestring(day: str, month: str, year: str) -> datetime:
|
||||||
locale.setlocale(locale.LC_TIME, "")
|
|
||||||
date = datetime.strptime(f"{day.zfill(2)} {month} {year}", "%d %B %Y")
|
date = datetime.strptime(f"{day.zfill(2)} {month} {year}", "%d %B %Y")
|
||||||
return date
|
return date
|
||||||
|
|
||||||
|
|
||||||
async def request_data(index: int, client: AsyncClient) -> Optional[Entry]:
|
async def request_data(index: int, client: AsyncClient) -> Optional[Entry]:
|
||||||
response_data = await client.get(DATA_URL + str(index), timeout=20.0)
|
response_data = await client.get(DATA_URL + str(index), timeout=20.0)
|
||||||
if "Fehler aufgetreten" not in response_data.text:
|
if ID_NONEXISTANT in response_data.text:
|
||||||
response_title = await client.get(MAIN_URL + str(index), timeout=20.0)
|
|
||||||
title_soup = BeautifulSoup(response_title.text, "lxml")
|
|
||||||
apartment = (
|
|
||||||
title_soup.body.header.h2.get_text()
|
|
||||||
.replace("\xa0", " ")
|
|
||||||
.replace("Wohneinheit: ", "")
|
|
||||||
)
|
|
||||||
|
|
||||||
data_soup = BeautifulSoup(response_data.text, "lxml")
|
|
||||||
valid_element = data_soup.find_all("td", attrs={"data-daynum": True})
|
|
||||||
availabilities = []
|
|
||||||
for elm in valid_element:
|
|
||||||
date_raw = elm.parent.parent.parent.thead.tr.td.get_text(strip=True)
|
|
||||||
status = STATUS_MAPPING[elm["class"][0]]
|
|
||||||
date = convert_to_datestring(
|
|
||||||
elm.get_text(strip=True),
|
|
||||||
date_raw.split(" ")[0],
|
|
||||||
date_raw.split(" ")[1],
|
|
||||||
)
|
|
||||||
availabilities.append(Availability(date=date, status=status))
|
|
||||||
return Entry(
|
|
||||||
index=index,
|
|
||||||
haus=title_soup.body.header.h1.get_text().encode("utf-8"),
|
|
||||||
wohneinheit=apartment.encode("utf-8"),
|
|
||||||
availabilities=availabilities,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
return Entry(index=0)
|
return Entry(index=0)
|
||||||
|
response_title = await client.get(MAIN_URL + str(index), timeout=20.0)
|
||||||
|
title_soup = BeautifulSoup(response_title.text, "html.parser")
|
||||||
|
apartment = (
|
||||||
|
title_soup.body.header.h2.get_text()
|
||||||
|
.replace("\xa0", " ")
|
||||||
|
.replace("Wohneinheit: ", "")
|
||||||
|
)
|
||||||
|
|
||||||
|
data_soup = BeautifulSoup(response_data.text, "html.parser")
|
||||||
|
valid_element = data_soup.find_all("td", attrs={"data-daynum": True})
|
||||||
|
availabilities = []
|
||||||
|
for elm in valid_element:
|
||||||
|
date_raw = elm.parent.parent.parent.thead.tr.td.get_text(strip=True)
|
||||||
|
status = STATUS_MAPPING[elm["class"][0]]
|
||||||
|
date = convert_to_datestring(
|
||||||
|
elm.get_text(strip=True),
|
||||||
|
date_raw.split(" ")[0],
|
||||||
|
date_raw.split(" ")[1],
|
||||||
|
)
|
||||||
|
availabilities.append(Availability(date=date, status=status))
|
||||||
|
return Entry(
|
||||||
|
index=index,
|
||||||
|
haus=title_soup.body.header.h1.get_text().encode("utf-8"),
|
||||||
|
wohneinheit=apartment.encode("utf-8"),
|
||||||
|
availabilities=availabilities,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
async def extract_results() -> None:
|
async def extract_results() -> None:
|
||||||
@ -127,10 +128,12 @@ async def extract_results() -> None:
|
|||||||
file.write(result.json())
|
file.write(result.json())
|
||||||
generate_csv(result)
|
generate_csv(result)
|
||||||
|
|
||||||
|
def main() -> None:
|
||||||
|
if platform.system() == "Windows":
|
||||||
|
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
|
||||||
|
asyncio.run(extract_results())
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# with open("results.json", "r") as file:
|
# with open("results.json", "r") as file:
|
||||||
# result = Result(**json.load(file))
|
# result = Result(**json.load(file))
|
||||||
if platform.system() == "Windows":
|
main()
|
||||||
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
|
|
||||||
asyncio.run(extract_results())
|
|
Loading…
x
Reference in New Issue
Block a user