Files
VSTU_Schedule_Parser/links_parser.py
2026-03-26 00:12:37 +03:00

62 lines
2.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Copyright Stanislav Mironov
import re
from urllib.parse import urljoin
import requests
from requests.structures import CaseInsensitiveDict
from bs4 import BeautifulSoup
# Site root; hrefs found on schedule pages are resolved against it.
BASE_URL = "https://www.vstu.ru/"
# Schedule page URL up to and including "dep="; a faculty id is appended to it.
RASP_PREFIX = f"{BASE_URL}student/raspisaniya/zanyatiy/index.php?dep="
def sibling_clear_to_date(s: "str | None") -> str:
    """Strip the "(последнее изменение: ...)" wrapper from a link's trailing text.

    The input is lower-cased, the "(последнее изменение:" prefix and every
    ")" are removed, and surrounding whitespace is trimmed.

    Args:
        s: Text node that follows an ``<a>`` tag, or ``None`` when the tag
            has no next sibling.

    Returns:
        The cleaned text, or the sentinel ``"!!!Python None!!!"`` when no
        text is available.
    """
    # The caller passes ``a.next_sibling``, which may be None or a non-text
    # node; neither carries date text, so return the sentinel rather than
    # failing on ``.lower()``.
    if not isinstance(s, str):
        return "!!!Python None!!!"
    return s.lower().replace("(последнее изменение:", "").replace(")", "").strip()
def parse_links(facultets, *, timeout=30):
    """Collect links to Excel (.xls / .xlsx) files from faculty schedule pages.

    For each faculty id the page ``RASP_PREFIX + facultet`` is downloaded and
    every ``<a>`` tag whose ``href`` ends in ``.xls`` or ``.xlsx`` becomes
    one record.

    Args:
        facultets: Iterable of faculty ids (strings) appended to
            ``RASP_PREFIX``.
        timeout: Seconds to wait on each HTTP request; forwarded unchanged
            to ``requests``. Without it a stalled server would block forever.

    Returns:
        A list of dicts sorted by their ``"url"`` value. Each dict has the
        keys ``"uniqpath"``, ``"facultet"``, ``"url"``,
        ``"display_filename"`` and ``"last_changed"``.
    """
    # Browser-like headers. NOTE: "Accept" and "Accept-Language" must be two
    # separate entries; a missing comma between them previously merged both
    # into a single malformed "Accept" value.
    headers = CaseInsensitiveDict(
        {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "Priority": "u=0, i",
            "Pragma": "no-cache",
            "Cache-Control": "no-cache",
        }
    )
    # Compiled once: matches hrefs ending in ".xls" or ".xlsx".
    excel_pattern = re.compile(r'\.xlsx?$')
    excel_links = []

    # The context manager closes the session's connections when done.
    with requests.Session() as session:
        # Replace (not merge with) the session's default headers.
        session.headers = headers
        for facultet in facultets:
            page_url = RASP_PREFIX + facultet
            print("getting...")
            r = session.get(page_url, timeout=timeout)
            print(f"GET {page_url}")
            soup = BeautifulSoup(r.text, 'html.parser')
            # Find all <a> tags whose href attribute matches the pattern.
            for a in soup.find_all('a', href=excel_pattern):
                # The node right after the link is passed on for date cleanup.
                last_changed = sibling_clear_to_date(a.next_sibling)
                file_url = urljoin(BASE_URL, a.get('href'))
                disp = a.decode_contents()
                record = {
                    "uniqpath": f"vstu.ru/rasp?dep={facultet}/{disp.strip()}",
                    "facultet": facultet,
                    "url": file_url,
                    "display_filename": disp,
                    "last_changed": last_changed,
                }
                print("Found in vstu.ru: ", record)
                excel_links.append(record)

    return sorted(excel_links, key=lambda x: x['url'])