"""Download VSTU schedule spreadsheets for every faculty and parse them.

The script runs in two phases:

1. For each faculty code in ``FACULTETS`` it fetches the schedule page and
   collects every link that ends in ``.xls`` / ``.xlsx``.
2. It downloads each spreadsheet into ``excels/``, feeds it to the project
   parser and merges the parsed groups into one mapping that is finally
   written to ``result.json``.

A failure on a single spreadsheet is logged and recorded in ``faileds``;
it does not stop the run.
"""
import json
import os
import re
import time
import traceback
from urllib.parse import urljoin

import pandas as pd
import xlwt
import xlrd
import requests
from requests.structures import CaseInsensitiveDict
from bs4 import BeautifulSoup

import aigenerated
import parser
import translations
import utils
import json

# Project-wide convention: coordinates are given as ROW first, then COL,
# and numbering is zero-based.

FACULTETS = [
    "asp",
    "mag",
    "fastiv",
    "fat",
    "ftkm",
    "ftpp",
    "feu",
    "fevt",
    "htf",
    "vkf",
    "mmf",
    "fpik",
]

BASE_URL = "https://www.vstu.ru/"
RASP_PREFIX = "https://www.vstu.ru/student/raspisaniya/zanyatiy/index.php?dep="

# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would block the script forever.
REQUEST_TIMEOUT = 30

# Directory the downloaded spreadsheets are stored in.
EXCEL_DIR = "excels"

# Matches hrefs that end in ".xls" or ".xlsx". Compiled once, outside the loop.
EXCEL_HREF_PATTERN = re.compile(r'\.xlsx?$')

session = requests.Session()
session.headers = CaseInsensitiveDict(
    {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0",
        # NOTE: "Accept" and "Accept-Language" must be two separate entries.
        # A missing comma previously glued them into one malformed value.
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
        "Referer": "http://dump.vstu.ru/",
        "Upgrade-Insecure-Requests": "1",
        "Priority": "u=0, i",
        "Pragma": "no-cache",
        "Cache-Control": "no-cache",
    }
)

# Faculty code -> set of absolute spreadsheet URLs found on its page.
EXCEL_LINKS = {}
# Unix timestamp of this run; embedded in every downloaded file name.
filestime = str(round(time.time()))

# Phase 1: collect spreadsheet links from every faculty page.
for facultet in FACULTETS:
    url = RASP_PREFIX + facultet
    r = session.get(url, timeout=REQUEST_TIMEOUT)
    print(f"GET {url}")

    soup = BeautifulSoup(r.text, 'html.parser')

    # Find all <a> tags whose href attribute matches the Excel pattern.
    excel_tags = soup.find_all('a', href=EXCEL_HREF_PATTERN)
    excel_links = [tag.get('href') for tag in excel_tags]

    # The hrefs are relative, e.g.
    # '../../../upload/raspisanie/z/ОН_ХТФ_1 курс.xlsx'; make them absolute.
    absolute_links = [urljoin(BASE_URL, relative_link) for relative_link in excel_links]

    EXCEL_LINKS.setdefault(facultet, set()).update(absolute_links)
    for excel_url in absolute_links:
        print(f"+url {excel_url}")

# Phase 2: download and parse every spreadsheet.
result = {}
faileds = []
counter = 0

# Make sure the download directory exists before the first file is saved.
os.makedirs(EXCEL_DIR, exist_ok=True)

for facultet in FACULTETS:
    # Jump to the next thousand so the counter in a file name also hints at
    # the faculty the file belongs to.
    counter += 1000
    print(f"\n\n-- Факультет '{facultet}' --")

    # Sorted so the counter-to-URL mapping (and which file wins when a group
    # appears twice) is reproducible between runs; set order is not.
    for excel_url in sorted(EXCEL_LINKS[facultet]):
        counter += 1
        print("\n\n-- Ссылка --")
        print(f"{excel_url}")

        is_xlsx = excel_url.endswith(".xlsx")
        extension = ".xlsx" if is_xlsx else ".xls"

        # Top-level boundary: any failure on one spreadsheet is logged and
        # recorded, then the loop moves on to the next file.
        try:
            filename = f"{EXCEL_DIR}/{facultet}{filestime}[C{counter}]{extension}"
            aigenerated.download_file_from_url(excel_url, filename)

            reader = translations.create_reader(filename)
            parser.LOGGING = False
            prs = parser.Parser(reader)
            prs.parse()

            for group_name, group in prs.groups.items():
                if group_name in result:
                    # The first occurrence wins; later duplicates are skipped.
                    print(f" -- WTF -- Doubled groups -- name: {group_name}")
                    continue

                gr = result[group_name] = group
                gr['facultet'] = facultet
                gr['data_source'] = excel_url.split("/")[-1]
                gr['parser_debug'] = {
                    "C_COUNTER": counter,
                    "timestime": filestime,
                    "excel_url": excel_url,
                    "reader_info": reader.info(),
                    "filename": filename,
                }

            print(f"Populates {len(prs.groups)} groups to result")
        except Exception as e:
            print(f"Error while {excel_url}")
            print(e)
            traceback.print_exc()
            faileds.append({
                "ex": e,
                "fac": facultet,
                "url": excel_url,
            })

# ensure_ascii=False emits raw Cyrillic, so the file must be opened as UTF-8
# explicitly; the context manager guarantees the data is flushed and closed.
with open('result.json', 'w', encoding='utf-8') as result_file:
    json.dump(result, result_file, indent=2, ensure_ascii=False)

print("Faileds:")
print(faileds)
print("Saved to result.json")