work
This commit is contained in:
53
links_parser.py
Normal file
53
links_parser.py
Normal file
@@ -0,0 +1,53 @@
|
||||
import re
|
||||
import time
|
||||
from urllib.parse import urljoin
|
||||
import requests
|
||||
from requests.structures import CaseInsensitiveDict
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Site root; hrefs scraped from schedule pages are resolved against it with urljoin.
BASE_URL = "https://www.vstu.ru/"
|
||||
# Schedule page URL prefix; a faculty identifier is appended as the `dep` query value.
RASP_PREFIX = "https://www.vstu.ru/student/raspisaniya/zanyatiy/index.php?dep="
|
||||
|
||||
|
||||
def parse_links(facultets):
    """Collect absolute URLs of Excel schedule files for each faculty.

    For every entry of ``facultets`` the schedule page ``RASP_PREFIX + entry``
    is fetched, and every ``<a>`` tag whose ``href`` ends with ``.xls`` or
    ``.xlsx`` is collected and resolved against ``BASE_URL``.

    Args:
        facultets: Iterable of strings; each one is appended to
            ``RASP_PREFIX`` to build the page URL.

    Returns:
        dict mapping each faculty string to a ``set`` of absolute URLs found
        on its page (an empty set when the page has no matching links).

    Raises:
        requests.RequestException: Propagated from ``session.get`` on network
            failures, including the request timeout.
    """
    # Compiled once: the pattern does not depend on the faculty being processed.
    excel_pattern = re.compile(r'\.xlsx?$')
    # Seconds to wait for the server before giving up instead of hanging forever.
    request_timeout = 30

    excel_links_by_faculty = {}

    # The context manager closes pooled connections even if a request fails.
    with requests.Session() as session:
        # Replace the default headers entirely with a browser-like set.
        session.headers = CaseInsensitiveDict(
            {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3",
                "Accept-Encoding": "gzip, deflate",
                "Connection": "keep-alive",
                "Referer": "http://dump.vstu.ru/",
                "Upgrade-Insecure-Requests": "1",
                "Priority": "u=0, i",
                "Pragma": "no-cache",
                "Cache-Control": "no-cache",
            }
        )

        for facultet in facultets:
            url = RASP_PREFIX + facultet
            r = session.get(url, timeout=request_timeout)
            print(f"GET {url}")
            soup = BeautifulSoup(r.text, 'html.parser')

            # Find all <a> tags whose href attribute matches the Excel pattern.
            excel_tags = soup.find_all('a', href=excel_pattern)

            # setdefault keeps links gathered earlier if a faculty is listed twice.
            faculty_links = excel_links_by_faculty.setdefault(facultet, set())

            for tag in excel_tags:
                # hrefs are typically relative, e.g.
                # '../../../upload/raspisanie/z/ОН_ХТФ_1 курс.xlsx'.
                excel_url = urljoin(BASE_URL, tag.get('href'))
                faculty_links.add(excel_url)
                print(f"+url {excel_url}")

    return excel_links_by_faculty
|
||||
Reference in New Issue
Block a user