# Copyright Stanislav Mironov
"""Collect links to Excel timetable files published on vstu.ru."""

import random
import re
import time
from typing import Dict, Iterable, List, Optional
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from requests.structures import CaseInsensitiveDict

BASE_URL = "https://www.vstu.ru/"
RASP_PREFIX = "https://www.vstu.ru/student/raspisaniya/zanyatiy/index.php?dep="

# Matches hrefs that end in ".xls" or ".xlsx"; compiled once at import time
# rather than once per fetched page.
_EXCEL_HREF_PATTERN = re.compile(r'\.xlsx?$')

# Marker returned when no "last changed" text is available next to a link.
_MISSING_DATE = "!!!Python None!!!"

# Browser-like request headers. Every header is its own key/value pair.
_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Priority": "u=0, i",
    "Pragma": "no-cache",
    "Cache-Control": "no-cache",
}


def sibling_clear_to_date(s: Optional[str]) -> str:
    """Strip the "last changed" wrapper text from the node following a link.

    The input is lower-cased, the "(последнее изменение:" prefix and every
    ")" character are removed, and surrounding whitespace is trimmed.
    Presumably what remains is the modification date shown on the page --
    TODO confirm against the live markup.

    Args:
        s: The text node that follows the link, or ``None`` when the link has
            no next sibling. Any non-string value (for example a tag element)
            is treated the same as ``None``.

    Returns:
        The cleaned text, or the marker ``"!!!Python None!!!"`` when no text
        is available.
    """
    # BeautifulSoup text nodes subclass ``str``; a tag sibling does not, and
    # calling ``.lower()`` on one would fail, so treat it as "no text".
    if not isinstance(s, str):
        return _MISSING_DATE
    return s.lower().replace("(последнее изменение:", "").replace(")", "").strip()


def parse_links(facultets: Iterable[str], *, timeout: float = 30.0) -> List[Dict[str, str]]:
    """Fetch the timetable page of each faculty and collect Excel file links.

    For every faculty code the page ``RASP_PREFIX + facultet`` is downloaded
    and every ``<a>`` tag whose ``href`` ends in ``.xls`` or ``.xlsx`` is
    turned into a record. A random pause of 0.1-1.0 seconds follows each
    page request.

    Args:
        facultets: Faculty codes appended to ``RASP_PREFIX`` as the ``dep``
            query value.
        timeout: Seconds to wait for each HTTP request before ``requests``
            raises a timeout error, so a stalled server cannot block forever.

    Returns:
        A list of dicts sorted by their ``"url"`` value, each with the keys
        ``"uniqpath"``, ``"facultet"``, ``"url"``, ``"display_filename"`` and
        ``"last_changed"``.

    Raises:
        requests.RequestException: If a page cannot be fetched (connection
            error or timeout).
    """
    excel_links = []

    # The context manager closes the session's pooled connections on exit.
    with requests.Session() as session:
        # Replaces (rather than extends) the session's default headers.
        session.headers = CaseInsensitiveDict(_REQUEST_HEADERS)

        for facultet in facultets:
            page_url = RASP_PREFIX + facultet
            print("getting...")
            response = session.get(page_url, timeout=timeout)
            print(f"GET {page_url}")

            soup = BeautifulSoup(response.text, 'html.parser')

            # Find every <a> tag whose href attribute matches the pattern.
            for a in soup.find_all('a', href=_EXCEL_HREF_PATTERN):
                last_changed = sibling_clear_to_date(a.next_sibling)
                # Resolve relative hrefs against the site root.
                file_url = urljoin(BASE_URL, a.get('href'))
                disp = a.decode_contents()

                record = {
                    "uniqpath": f"vstu.ru/rasp?dep={facultet}/{disp.strip()}",
                    "facultet": facultet,
                    "url": file_url,
                    "display_filename": disp,
                    "last_changed": last_changed,
                }
                print("Found in vstu.ru: ", record)
                excel_links.append(record)

            # Randomized pause between page requests.
            st = random.randint(1, 10) / 10
            print(f"sleep {st}s")
            time.sleep(st)

    return sorted(excel_links, key=lambda x: x['url'])