import re
import time
from urllib.parse import urljoin

import requests
from requests.structures import CaseInsensitiveDict
from bs4 import BeautifulSoup

BASE_URL = "https://www.vstu.ru/"
RASP_PREFIX = "https://www.vstu.ru/student/raspisaniya/zanyatiy/index.php?dep="

# Matches href values ending in ".xls" or ".xlsx". Compiled once at import
# time instead of once per requested page.
_EXCEL_HREF_RE = re.compile(r'\.xlsx?$')

# Browser-like headers sent with every request. Each header is its own
# key/value pair: a missing comma between two adjacent string literals would
# silently merge them into a single (corrupted) header value.
_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
    "Referer": "http://dump.vstu.ru/",
    "Upgrade-Insecure-Requests": "1",
    "Priority": "u=0, i",
    "Pragma": "no-cache",
    "Cache-Control": "no-cache",
}


def parse_links(facultets, timeout=30.0):
    """Collect links to Excel timetable files for the given departments.

    For every entry of *facultets* the page ``RASP_PREFIX + facultet`` is
    downloaded, all ``<a>`` tags whose ``href`` ends in ``.xls``/``.xlsx``
    are extracted, and each href is made absolute against ``BASE_URL``.

    Args:
        facultets: Iterable of strings appended to ``RASP_PREFIX`` to build
            each page URL (the value of the ``dep`` query parameter).
        timeout: Timeout in seconds passed to every HTTP request, so that an
            unresponsive server cannot block the call indefinitely.

    Returns:
        A dict mapping each requested ``facultet`` to a set of absolute
        Excel-file URLs found on its page. The set is empty when the page
        contains no matching links.

    Raises:
        requests.exceptions.RequestException: If a request fails at the
            network level (connection error, timeout, ...).
    """
    links_by_facultet = {}

    # The context manager closes the session's connection pool on exit,
    # including when a request raises.
    with requests.Session() as session:
        # Replace (not extend) the default headers of the session.
        session.headers = CaseInsensitiveDict(_REQUEST_HEADERS)

        for facultet in facultets:
            url = RASP_PREFIX + facultet
            r = session.get(url, timeout=timeout)
            print(f"GET {url}")

            soup = BeautifulSoup(r.text, 'html.parser')

            # Find every <a> tag whose href attribute matches the pattern.
            excel_tags = soup.find_all('a', href=_EXCEL_HREF_RE)

            # Hrefs may be relative,
            # e.g. '../../../upload/raspisanie/z/ОН_ХТФ_1 курс.xlsx'.
            facultet_links = links_by_facultet.setdefault(facultet, set())
            for tag in excel_tags:
                excel_url = urljoin(BASE_URL, tag.get('href'))
                facultet_links.add(excel_url)
                print(f"+url {excel_url}")

    return links_by_facultet