59 lines
2.1 KiB
Python
59 lines
2.1 KiB
Python
# Copyright Stanislav Mironov
|
||
|
||
|
||
import re
|
||
from urllib.parse import urljoin
|
||
import requests
|
||
from requests.structures import CaseInsensitiveDict
|
||
from bs4 import BeautifulSoup
|
||
|
||
# Site root: hrefs found on the schedule pages are resolved against it with urljoin.
BASE_URL = "https://www.vstu.ru/"
|
||
# Schedule page URL ending in an empty "dep" query parameter; parse_links
# appends one entry of its `facultets` argument to build each page URL.
RASP_PREFIX = "https://www.vstu.ru/student/raspisaniya/zanyatiy/index.php?dep="
|
||
|
||
def sibling_clear_to_date(s: str):
    """Strip the "last changed" wrapper from the text that follows a link.

    The text is lower-cased, the Russian marker "(последнее изменение:"
    ("(last changed:") and every ")" are removed, and surrounding
    whitespace is trimmed.

    Args:
        s: Text to clean, or ``None`` when there is nothing to clean.

    Returns:
        The cleaned text, or the sentinel ``"!!!Python None!!!"`` when *s*
        is ``None``.
    """
    if s is not None:
        cleaned = s.lower()
        # Drop the wrapper pieces one by one, in the same order every time.
        for noise in ("(последнее изменение:", ")"):
            cleaned = cleaned.replace(noise, "")
        return cleaned.strip()
    return "!!!Python None!!!"
|
||
|
||
def parse_links(facultets, timeout=30):
    """Collect links to the .xls / .xlsx files published for each faculty.

    For every entry of *facultets* the page ``RASP_PREFIX + facultet`` is
    downloaded and every ``<a>`` tag whose ``href`` ends in ``.xls`` or
    ``.xlsx`` is turned into a record. Progress is reported with ``print``.

    Args:
        facultets: Iterable of strings; each one is appended to
            ``RASP_PREFIX`` to form the URL of the page to scan.
        timeout: Timeout in seconds handed to ``requests`` for every page
            download, so a stalled server cannot block the call forever.

    Returns:
        A list of dicts with the keys ``"facultet"`` (the entry the link was
        found for), ``"url"`` (the href resolved against ``BASE_URL``) and
        ``"last_changed"`` (the cleaned text that follows the link), sorted
        by ``"url"``.

    Raises:
        requests.RequestException: If a page cannot be downloaded, including
            when the timeout expires.
    """
    # Loop-invariant, so compile it once instead of once per faculty.
    excel_pattern = re.compile(r'\.xlsx?$')
    excel_links = []

    # The context manager closes the pooled connections even if a request fails.
    with requests.Session() as session:
        session.headers = CaseInsensitiveDict(
            {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                # Previously fused into the Accept value by a missing comma.
                "Accept-Language": "ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3",
                "Accept-Encoding": "gzip, deflate",
                "Connection": "keep-alive",
                "Upgrade-Insecure-Requests": "1",
                "Priority": "u=0, i",
                "Pragma": "no-cache",
                "Cache-Control": "no-cache",
            }
        )

        for facultet in facultets:
            page_url = RASP_PREFIX + facultet
            print("getting...")
            response = session.get(page_url, timeout=timeout)
            print(f"GET {page_url}")
            soup = BeautifulSoup(response.text, 'html.parser')

            # Every <a> tag whose href attribute matches the Excel pattern.
            for anchor in soup.find_all('a', href=excel_pattern):
                sibling = anchor.next_sibling
                # The node after the link may be a tag rather than plain
                # text; a tag has no str.lower(), so reduce it to its text
                # before handing it to the cleaner.
                if sibling is not None and not isinstance(sibling, str):
                    sibling = sibling.get_text()

                record = {
                    "facultet": facultet,
                    "url": urljoin(BASE_URL, anchor.get('href')),
                    "last_changed": sibling_clear_to_date(sibling),
                }
                print("Found in vstu.ru: ", record)
                excel_links.append(record)

    return sorted(excel_links, key=lambda x: x['url'])
|
||
|