1
0
mirror of https://github.com/serega404/VodokanalBot.git synced 2026-05-30 12:10:01 +03:00

Вынес парсер в отдельный файл

This commit is contained in:
2026-05-29 01:03:07 +03:00
parent 23ab8113cf
commit 6f2b27f00e
7 changed files with 186 additions and 105 deletions
+2
View File
@@ -0,0 +1,2 @@
__pycache__/
*.py[cod]
+2 -1
View File
@@ -11,7 +11,8 @@ RUN pip3 install -r requirements.txt
COPY crontab /tmp/crontab COPY crontab /tmp/crontab
RUN cat /tmp/crontab > /etc/crontabs/root RUN cat /tmp/crontab > /etc/crontabs/root
COPY main.py main.py COPY start_telegram.py start_telegram.py
COPY parser.py parser.py
# run crond as main process of container # run crond as main process of container
CMD ["crond", "-f", "-l", "2"] CMD ["crond", "-f", "-l", "2"]
+18
View File
@@ -23,6 +23,24 @@ docker run -d --name VodokanalBot \
docker compose up -d --build docker compose up -d --build
``` ```
## Интеграции
Общая логика парсинга, работы с `data/db.json` и поиска новых сообщений вынесена в [`parser.py`](./parser.py).
Для новой интеграции достаточно создать свой адаптер отправки и передать его в `publish_new_posts`:
``` Python
from parser import create_session, publish_new_posts
session = create_session()
publish_new_posts(
send_message=lambda message: print(message),
session=session,
url="http://www.tgnvoda.ru/avarii.php",
)
```
## Библиотеки ## Библиотеки
* [Requests](https://requests.readthedocs.io/en/latest/) * [Requests](https://requests.readthedocs.io/en/latest/)
+2 -2
View File
@@ -1,2 +1,2 @@
@reboot cd /app && python3 /app/main.py @reboot cd /app && python3 /app/start_telegram.py
*/10 * * * * cd /app && python3 /app/main.py */10 * * * * cd /app && python3 /app/start_telegram.py
-101
View File
@@ -1,101 +0,0 @@
import requests, json, os
from bs4 import BeautifulSoup
from datetime import datetime
# Config
# Every setting is read from the environment. The two Telegram values have no
# usable default and are validated right below.
URL = os.environ.get('VODOKANAL_URL', 'http://www.tgnvoda.ru/avarii.php')
# Stays the boolean False when unset, otherwise the raw environment string.
SEND_SILENT = os.environ.get('SEND_SILENT', False)
TELEGRAM_TOKEN = os.environ.get('TELEGRAM_TOKEN', '')
TELEGRAM_CHANNEL = os.environ.get('TELEGRAM_CHANNEL', '')
PROXY_URL = os.environ.get('PROXY_URL', '')
# Upper bound, in seconds, for each HTTP request. requests applies no timeout
# by default, so a stalled server would otherwise block the script forever.
REQUEST_TIMEOUT = 30

if TELEGRAM_TOKEN == '':
    print("Telegram token is not set")
    raise SystemExit(1)  # non-zero status: the run failed

if TELEGRAM_CHANNEL == '':
    print("Telegram channel is not set")
    raise SystemExit(1)

# Configure HTTP client
# One session is shared by the page download and the Telegram calls, so the
# optional proxy applies to both.
session = requests.Session()
if PROXY_URL != '':
    session.proxies.update({
        'http': PROXY_URL,
        'https': PROXY_URL,
    })

# Load database
# The database is the JSON list of "<date>$<text>" strings written at the end
# of a previous run; None means there is no such file yet.
db = None
if os.path.isfile('data/db.json'):
    with open('data/db.json', 'r', encoding='utf-8') as f:
        db = json.load(f)
else:
    print("Database not loaded")

# Get data
req = session.get(URL, timeout=REQUEST_TIMEOUT)
if req.status_code != 200:
    print("Request error: " + str(req.status_code))
    raise SystemExit(1)

soup = BeautifulSoup(req.content, "html.parser")

# Take "today" once so every tag is compared against the same day and month,
# even if the script happens to run across midnight.
now = datetime.today()
current_day = str(now.day).zfill(2)
current_month = str(now.month).zfill(2)

elements = []
for tag in soup.find_all('font', size='2', face='VERDANA'):
    date = tag.select_one('font:nth-of-type(1)').b.text
    date_parts = date.split('.')
    # Only posts whose date label starts with today's "DD.MM" are kept.
    if not (date_parts[0] == current_day and date_parts[1] == current_month):
        continue
    elements.append(date + "$" + tag.select_one('font:nth-of-type(2)').text.replace('\n', ''))

if elements == []:
    print("No posts")
    raise SystemExit(0)  # nothing to do is not an error

print("The number of posts for this day:", len(elements))


# Send telegram message
def send_message(message):
    """Send *message* to the configured Telegram channel.

    Terminates the script with exit status 1 when Telegram answers with
    anything other than HTTP 200.
    """
    req = session.get(
        "https://api.telegram.org/bot" + TELEGRAM_TOKEN + "/sendMessage",
        params={
            'chat_id': TELEGRAM_CHANNEL,
            'disable_notification': str(SEND_SILENT),
            'text': message,
        },
        timeout=REQUEST_TIMEOUT,
    )
    if req.status_code != 200:
        print("Telegram request error: " + str(req.status_code))
        raise SystemExit(1)
    print("Telegram message sent, mess id: " + str(req.json()['result']['message_id']))


# Compare db and elements
if db is not None:
    known = set(db)
    # Walk the scraped list instead of a set difference so messages go out in
    # page order; dict.fromkeys drops repeated entries while keeping order.
    diff = [element for element in dict.fromkeys(elements) if element not in known]
    if not diff:
        print("No new posts")
        raise SystemExit(0)
    for i in diff:
        # Only the text after the first "$" is sent; the date is dropped.
        send_message(i.split("$", 1)[1])
else:
    # No database yet: every post found for today is sent.
    for element in elements:
        send_message(element.split("$", 1)[1])

# Save database
# Reached only when all messages were sent (send_message exits on failure).
if not os.path.exists("data"):
    os.makedirs("data")

with open('data/db.json', 'w', encoding='utf-8') as f:
    json.dump(elements, f, ensure_ascii=False)
print("Database updated")
+110
View File
@@ -0,0 +1,110 @@
import json
import os
from dataclasses import dataclass
from datetime import datetime
import requests
from bs4 import BeautifulSoup
# Default location of the JSON file holding the keys of already seen posts;
# used as the default `path`/`db_path` argument by the functions below.
DEFAULT_DB_PATH = "data/db.json"
@dataclass(frozen=True)
class Post:
    """A single scraped post: its date label and its text.

    Instances are immutable (frozen dataclass), so they compare by value and
    can be hashed.
    """

    # Date label of the post, stored exactly as it was scraped.
    date: str
    # Body of the post.
    text: str

    @property
    def key(self):
        """Return the ``"<date>$<text>"`` string identifying this post.

        This is the form in which posts are written to and compared against
        the JSON database.
        """
        return "$".join((self.date, self.text))
def create_session(proxy_url=""):
    """Build the ``requests.Session`` used for all HTTP traffic.

    Args:
        proxy_url: Proxy address applied to both ``http`` and ``https``
            requests. An empty (falsy) value leaves the session without
            explicit proxies.

    Returns:
        The newly created session.
    """
    http_client = requests.Session()
    if not proxy_url:
        return http_client
    # The same proxy handles both schemes.
    http_client.proxies.update(dict.fromkeys(("http", "https"), proxy_url))
    return http_client
def load_database(path=DEFAULT_DB_PATH):
    """Load the list of already seen post keys from the JSON database.

    Args:
        path: Location of the JSON file.

    Returns:
        The decoded JSON content (the list of keys written by
        ``save_database``), or ``None`` when the file does not exist or
        cannot be decoded.
    """
    if not os.path.isfile(path):
        print("Database not loaded")
        return None
    try:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except ValueError as error:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError
        # subclasses. A truncated or corrupt file (e.g. an interrupted write)
        # is treated like a missing database instead of crashing every run;
        # the next successful save overwrites it.
        print("Database not loaded: " + str(error))
        return None
def save_database(posts, path=DEFAULT_DB_PATH):
    """Write the keys of *posts* to the JSON database, replacing its content.

    Args:
        posts: Iterable of objects exposing a ``key`` attribute.
        path: Destination file. Its parent directory is created when the path
            has one and it does not exist yet.
    """
    parent = os.path.dirname(path)
    parent_missing = bool(parent) and not os.path.exists(parent)
    if parent_missing:
        os.makedirs(parent)

    with open(path, "w", encoding="utf-8") as db_file:
        keys = [entry.key for entry in posts]
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        json.dump(keys, db_file, ensure_ascii=False)

    print("Database updated")
def fetch_posts(session, url, today=None, timeout=30):
    """Download *url* and return today's posts parsed from the response.

    Args:
        session: Object with a requests-style ``get(url, **kwargs)`` method.
        url: Address of the page to download.
        today: Forwarded to ``parse_posts``; ``None`` lets it use the
            current date.
        timeout: Seconds to wait for the server. requests applies no timeout
            by default, so without it a stalled server would block the
            caller indefinitely.

    Returns:
        Whatever ``parse_posts`` returns for the response body.

    Raises:
        RuntimeError: The server answered with a status other than 200.
        Exceptions raised by ``session.get`` itself (connection errors,
        timeouts) propagate unchanged.
    """
    response = session.get(url, timeout=timeout)
    if response.status_code != 200:
        raise RuntimeError("Request error: " + str(response.status_code))
    return parse_posts(response.content, today=today)
def parse_posts(content, today=None):
    """Extract today's posts from the page markup.

    Args:
        content: HTML of the page, handed to BeautifulSoup's ``html.parser``.
        today: Object with ``day`` and ``month`` attributes to match against;
            a falsy value means ``datetime.today()``.

    Returns:
        A list of ``Post`` objects, in document order, for the entries whose
        date label matches *today*.
    """
    reference_day = today if today else datetime.today()
    document = BeautifulSoup(content, "html.parser")

    found = []
    # Each candidate entry is a <font size="2" face="VERDANA"> element. Its
    # first nested <font> carries the bold date label, the second the text.
    for entry in document.find_all("font", size="2", face="VERDANA"):
        label = entry.select_one("font:nth-of-type(1)").b.text
        if is_today(label, reference_day):
            body = entry.select_one("font:nth-of-type(2)").text
            found.append(Post(date=label, text=body.replace("\n", "")))
    return found
def is_today(date, today):
    """Tell whether a ``DD.MM[...]`` date label falls on *today*.

    Args:
        date: Date label; only its first two dot-separated fields (day and
            month) are examined. Whitespace around the label and around each
            field is ignored.
        today: Object with integer ``day`` and ``month`` attributes.

    Returns:
        ``True`` when day and month equal the zero-padded day and month of
        *today*; ``False`` otherwise, including for labels that do not
        contain at least two dot-separated fields.
    """
    fields = date.strip().split(".")
    if len(fields) < 2:
        # Malformed label: report "not today" rather than failing on unpack.
        return False
    day, month = (field.strip() for field in fields[:2])
    return day == str(today.day).zfill(2) and month == str(today.month).zfill(2)
def get_new_posts(posts, database):
    """Select the posts whose key is not yet stored in the database.

    Args:
        posts: List of objects exposing a ``key`` attribute.
        database: Iterable of known keys, or ``None`` when no database was
            loaded.

    Returns:
        *posts* itself when *database* is ``None``; otherwise a new list, in
        the original order, of the posts whose key is absent from *database*.
    """
    if database is None:
        return posts

    seen_keys = set(database)
    fresh = []
    for candidate in posts:
        if candidate.key in seen_keys:
            continue
        fresh.append(candidate)
    return fresh
def publish_new_posts(send_message, session, url, db_path=DEFAULT_DB_PATH):
    """Fetch today's posts, send the unseen ones and update the database.

    Args:
        send_message: Callable invoked once per new post with the post text.
        session: HTTP session handed to ``fetch_posts``.
        url: Page to download.
        db_path: JSON database file to read and, after sending, overwrite.

    The database is loaded before the page is fetched. It is rewritten with
    all of today's posts only after every new post has been passed to
    *send_message*. When there are no posts, or no new ones, nothing is
    written.
    """
    known_keys = load_database(db_path)
    todays_posts = fetch_posts(session, url)

    if not todays_posts:
        print("No posts")
        return
    print("The number of posts for this day:", len(todays_posts))

    unseen = get_new_posts(todays_posts, known_keys)
    if not unseen:
        print("No new posts")
        return

    for item in unseen:
        send_message(item.text)

    save_database(todays_posts, db_path)
+51
View File
@@ -0,0 +1,51 @@
import os
from parser import create_session, publish_new_posts
# Runtime configuration, taken from the process environment.
# Page that is scraped for posts.
URL = os.getenv('VODOKANAL_URL', 'http://www.tgnvoda.ru/avarii.php')
# Stays the boolean False when unset, otherwise the raw environment string;
# it is stringified when sent as Telegram's `disable_notification`.
SEND_SILENT = os.getenv('SEND_SILENT', False)
# Bot token and target chat; both must be non-empty (checked in main()).
TELEGRAM_TOKEN = os.getenv('TELEGRAM_TOKEN', '')
TELEGRAM_CHANNEL = os.getenv('TELEGRAM_CHANNEL', '')
# Optional proxy for all HTTP traffic; empty means no proxy.
PROXY_URL = os.getenv('PROXY_URL', '')
def send_telegram_message(session, message, timeout=30):
    """Send *message* to the configured Telegram channel.

    Args:
        session: Object with a requests-style ``get(url, **kwargs)`` method.
        message: Text of the message.
        timeout: Seconds to wait for Telegram. requests applies no timeout
            by default, so without it a stalled connection would block the
            run indefinitely.

    Raises:
        SystemExit: With status 1 when Telegram answers with a status other
            than 200, so the failure is visible to whatever launched the
            script. The previous ``exit()`` ended with status 0 and depends
            on the ``site`` module.
    """
    response = session.get(
        "https://api.telegram.org/bot" + TELEGRAM_TOKEN + "/sendMessage",
        params={
            'chat_id': TELEGRAM_CHANNEL,
            'disable_notification': str(SEND_SILENT),
            'text': message,
        },
        timeout=timeout,
    )
    if response.status_code != 200:
        print("Telegram request error: " + str(response.status_code))
        raise SystemExit(1)
    print("Telegram message sent, mess id: " + str(response.json()['result']['message_id']))
def main():
    """Validate the configuration and publish new posts to Telegram.

    Raises:
        SystemExit: With status 1 when the Telegram token or channel is
            missing, or when publishing fails with ``RuntimeError``. The
            message is printed first. The previous ``exit()`` calls ended
            with status 0, which made failures look like successful runs.
    """
    if TELEGRAM_TOKEN == '':
        print("Telegram token is not set")
        raise SystemExit(1)

    if TELEGRAM_CHANNEL == '':
        print("Telegram channel is not set")
        raise SystemExit(1)

    # The same session (and therefore the same proxy settings) serves both
    # the page download and the Telegram calls.
    session = create_session(PROXY_URL)

    try:
        publish_new_posts(
            send_message=lambda message: send_telegram_message(session, message),
            session=session,
            url=URL,
        )
    except RuntimeError as error:
        print(error)
        raise SystemExit(1)


if __name__ == "__main__":
    main()