work copy

This commit is contained in:
2025-09-11 14:16:38 +03:00
parent b62640e39b
commit 414907a929
8 changed files with 88146 additions and 664 deletions

141
main.py
View File

@@ -1,21 +1,146 @@
import json
import re
import time
import traceback
from urllib.parse import urljoin
import pandas as pd
import xlwt
import xlrd
import requests
from requests.structures import CaseInsensitiveDict
from bs4 import BeautifulSoup
import aigenerated
import parser
import utils
import json
# Project-wide convention: coordinates are always given as ROW first, then COL,
# and both are zero-based.

# Site root; scraped spreadsheet hrefs are resolved against this URL.
BASE_URL = "https://www.vstu.ru/"

# Timetable index page; a faculty code is appended as the value of ``dep``.
RASP_PREFIX = "https://www.vstu.ru/student/raspisaniya/zanyatiy/index.php?dep="

# Faculty codes, in the order in which the script walks them.
FACULTETS = [
    "asp",
    "mag",
    "fastiv",
    "fat",
    "ftkm",
    "ftpp",
    "feu",
    "fevt",
    "htf",
    "vkf",
    "mmf",
    "fpik",
]
# Open the timetable workbook stored next to the script; formatting_info=True
# asks xlrd to load the cell formatting records along with the values.
book = xlrd.open_workbook("ОН_ФЭВТ_2 курс.xls", formatting_info=True)
print(f"The number of worksheets is {book.nsheets}")
print(f"Worksheet name(s): {book.sheet_names()}")
# The first worksheet is the one handed to the parser later in the script.
sh = book.sheet_by_index(0)
# Shared HTTP session used for every timetable page request. The default
# requests headers are replaced wholesale with a browser-like set (Firefox on
# Windows user agent, Russian-first language preference, caching disabled).
session = requests.Session()
session.headers = CaseInsensitiveDict(
    {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        # Must be its own key/value pair: as a bare string after a value with no
        # trailing comma it is silently concatenated onto the Accept header.
        "Accept-Language": "ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
        "Referer": "http://dump.vstu.ru/",
        "Upgrade-Insecure-Requests": "1",
        "Priority": "u=0, i",
        "Pragma": "no-cache",
        "Cache-Control": "no-cache",
    }
)
# Faculty code -> set of absolute spreadsheet URLs found on that faculty's
# timetable page.
EXCEL_LINKS = {}
# Start time of this run as whole Unix seconds (string); it is embedded in the
# names of the downloaded files further down.
filestime = str(round(time.time()))
# Matches an href ending in ".xls" or ".xlsx". Compiled once here because it
# does not depend on the faculty being processed.
excel_pattern = re.compile(r'\.xlsx?$')
for facultet in FACULTETS:
    url = RASP_PREFIX + facultet
    r = session.get(url)
    print(f"GET {url}")
    soup = BeautifulSoup(r.text, 'html.parser')
    faculty_links = EXCEL_LINKS.setdefault(facultet, set())
    # Every <a> tag whose href attribute matches the spreadsheet pattern.
    for tag in soup.find_all('a', href=excel_pattern):
        # hrefs on the page are relative, e.g.
        # '../../../upload/raspisanie/z/ОН_ХТФ_1 курс.xlsx'.
        # NOTE(review): they are resolved against BASE_URL rather than the page
        # URL; confirm this is intended if the site ever uses page-relative links.
        excel_url = urljoin(BASE_URL, tag.get('href'))
        faculty_links.add(excel_url)
        print(f"+url {excel_url}")
def _dump_json(data, path):
    """Write *data* to *path* as indented JSON with non-ASCII text kept readable.

    The file is opened as UTF-8 explicitly because ``ensure_ascii=False`` emits
    Cyrillic characters verbatim, and the handle is closed deterministically.
    """
    with open(path, 'w', encoding='utf-8') as fh:
        json.dump(data, fh, indent=2, ensure_ascii=False)


def _convert_xlsx_to_xls(xlsx_path, xls_path):
    """Copy every sheet of the .xlsx workbook at *xlsx_path* into a new .xls file.

    Only cell values are carried over (the copy goes through pandas DataFrames).
    """
    # NOTE(review): the 'xlwt' writer engine was removed in pandas 2.0 - confirm
    # the pandas version this project is pinned to.
    with pd.ExcelFile(xlsx_path) as excel_file, \
            pd.ExcelWriter(xls_path, engine='xlwt') as writer:
        print("Начинаю конвертацию...")
        for sheet_name in excel_file.sheet_names:
            print(f" - Обрабатываю лист: {sheet_name}")
            # header=None on read and header=False on write keep the grid
            # cell-for-cell: with header inference the first row would come
            # back as "Unnamed: N" / "name.1" column labels, which corrupts a
            # sheet that is later addressed by ROW/COL coordinates.
            df = excel_file.parse(sheet_name, header=None)
            # index=False: do not add an extra column with the pandas index.
            df.to_excel(writer, sheet_name=sheet_name, index=False, header=False)
    print(f"✅ Успешно! Файл конвертирован как: {xls_path}")


# Parse the worksheet that was opened from the local workbook earlier.
prs = parser.Parser(sh)
prs.parse()

# Group name -> parsed group data, enriched below with provenance fields.
result = {}
# One record per spreadsheet whose download/conversion/parsing raised.
faileds = []
# Advances by 1000 per faculty and by 1 per spreadsheet; the value is embedded
# in the downloaded file name and in each group's 'parser_debug' record.
counter = 0
for facultet in FACULTETS:
    counter += 1000
    print(f"\n\n-- Факультет '{facultet}' --")
    # The URLs are stored in a set, whose iteration order for strings differs
    # between runs; sorting keeps counters, file names and the winner among
    # duplicated groups reproducible.
    for excel_url in sorted(EXCEL_LINKS[facultet]):
        counter += 1
        print("\n\n-- Ссылка --")
        print(f"{excel_url}")
        xlsx = excel_url.endswith(".xlsx")
        # Snapshot of the groups held by the current parser object, i.e. the
        # one created for the previously processed workbook.
        # NOTE(review): the last workbook's groups are never snapshotted -
        # confirm whether this dump is still wanted here.
        _dump_json(prs.groups, 'groups.json')
        print("Saved to groups.json")
        try:
            filename = f"excels/{facultet}{filestime}[C{counter}].xls"
            if xlsx:
                # Download under the ".xlsx" name, then convert to ".xls" so
                # that xlrd can open it below.
                aigenerated.download_file_from_url(excel_url, filename + "x")
                _convert_xlsx_to_xls(filename + "x", filename)
            else:
                aigenerated.download_file_from_url(excel_url, filename)

            book = xlrd.open_workbook(filename, formatting_info=True)
            print(f"The number of worksheets is {book.nsheets}")
            print(f"Worksheet name(s): {book.sheet_names()}")
            # Only the first worksheet of each workbook is parsed.
            sh = book.sheet_by_index(0)
            parser.LOGGING = False
            prs = parser.Parser(sh)
            prs.parse()

            for group_name, group in prs.groups.items():
                if group_name in result:
                    # First occurrence wins; later duplicates are only reported.
                    print(f" -- WTF -- Doubled groups -- name: {group_name}")
                    continue
                result[group_name] = group
                group['facultet'] = facultet
                group['data_source'] = f"{excel_url.split('/')[-1]} SHEET: {sh.name}"
                group['parser_debug'] = {
                    "C_COUNTER": counter,
                    "timestime": filestime,
                    "excel_url": excel_url,
                    "sheet": sh.name,
                    "filename": filename,
                }
        except Exception as e:
            # Batch boundary: one broken spreadsheet must not stop the run, so
            # the error is reported, recorded and the loop moves on.
            print(f"Error while {excel_url}")
            print(e)
            traceback.print_exc()
            faileds.append({
                "ex": e,
                "fac": facultet,
                "url": excel_url,
            })

_dump_json(result, 'result.json')
print("Faileds:")
print(faileds)
print("Saved to result.json")