This commit is contained in:
2025-09-11 16:35:22 +03:00
parent babf491c8e
commit 6920d24a98
6 changed files with 173 additions and 276486 deletions

2
.gitignore vendored
View File

@@ -2,3 +2,5 @@
*.xlsx *.xlsx
__pycache__ __pycache__
.idea .idea
result.json
groups.json

File diff suppressed because it is too large Load Diff

53
links_parser.py Normal file
View File

@@ -0,0 +1,53 @@
import re
import time
from urllib.parse import urljoin
import requests
from requests.structures import CaseInsensitiveDict
from bs4 import BeautifulSoup
# Site root that scraped (possibly relative) hrefs are resolved against.
BASE_URL = "https://www.vstu.ru/"
# Timetable page URL prefix; a department code is appended as the `dep` query value.
RASP_PREFIX = "https://www.vstu.ru/student/raspisaniya/zanyatiy/index.php?dep="
def parse_links(facultets, timeout=30):
    """Collect absolute URLs of the Excel timetable files of each department.

    For every department code the page ``RASP_PREFIX + code`` is downloaded
    and every ``<a>`` tag whose ``href`` ends in ``.xls`` or ``.xlsx`` is
    collected. Each href is resolved against ``BASE_URL``.

    Args:
        facultets: Iterable of department codes (strings) appended to
            ``RASP_PREFIX``.
        timeout: Per-request timeout in seconds passed to ``requests``.
            Without it a stalled server would block the run indefinitely.

    Returns:
        dict mapping each department code to a ``set`` of absolute URLs.
        A department whose page has no matching links maps to an empty set.

    Raises:
        requests.RequestException: if a page cannot be fetched (connection
            failure or timeout).
    """
    # Compiled once: the pattern does not depend on the department.
    excel_pattern = re.compile(r'\.xlsx?$')
    excel_links_by_facultet = {}

    # The context manager releases pooled connections when scraping is done.
    with requests.Session() as session:
        session.headers = CaseInsensitiveDict(
            {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0",
                # "Accept" and "Accept-Language" are two separate headers; a
                # missing comma used to glue them into one broken "Accept" value.
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3",
                "Accept-Encoding": "gzip, deflate",
                "Connection": "keep-alive",
                "Referer": "http://dump.vstu.ru/",
                "Upgrade-Insecure-Requests": "1",
                "Priority": "u=0, i",
                "Pragma": "no-cache",
                "Cache-Control": "no-cache",
            }
        )

        for facultet in facultets:
            url = RASP_PREFIX + facultet
            r = session.get(url, timeout=timeout)
            print(f"GET {url}")
            soup = BeautifulSoup(r.text, 'html.parser')

            links = excel_links_by_facultet.setdefault(facultet, set())
            # Find every <a> tag whose href matches the Excel pattern.
            for tag in soup.find_all('a', href=excel_pattern):
                # hrefs may be relative, e.g. '../../../upload/raspisanie/z/file.xlsx'
                excel_url = urljoin(BASE_URL, tag.get('href'))
                links.add(excel_url)
                print(f"+url {excel_url}")

    return excel_links_by_facultet

137
main.py
View File

@@ -8,7 +8,7 @@ import xlwt
import xlrd import xlrd
import requests import requests
from requests.structures import CaseInsensitiveDict
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import aigenerated import aigenerated
@@ -16,75 +16,28 @@ import parser
import translations import translations
import utils import utils
import json import json
import links_parser
# Общее правило проекта, сначала в координатах идёт ROW а потом COL, нумерация с нуля # Общее правило проекта, сначала в координатах идёт ROW а потом COL, нумерация с нуля
FACULTETS = [ FACULTETS = [
"asp", "mag", "fastiv", "fat", "ftkm", "ftpp", "feu", "fevt", "htf", "vkf", "mmf", "fpik" "asp", "mag", "fastiv", "fat", "ftkm", "ftpp", "feu", "fevt", "htf", "vkf", "mmf", "fpik"
] ]
BASE_URL = "https://www.vstu.ru/"
RASP_PREFIX = "https://www.vstu.ru/student/raspisaniya/zanyatiy/index.php?dep="
session = requests.Session() DEBUG_ONE_FAC = None #'fevt'
session.headers = CaseInsensitiveDict(
{
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
"Accept-Language: ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3",
"Accept-Encoding": "gzip, deflate",
"Connection": "keep-alive",
"Referer": "http://dump.vstu.ru/",
"Upgrade-Insecure-Requests": "1",
"Priority": "u=0, i",
"Pragma": "no-cache",
"Cache-Control": "no-cach",
}
)
EXCEL_LINKS = {}
filestime = str(round(time.time()))
for facultet in FACULTETS:
url = RASP_PREFIX + facultet
r = session.get(url)
print(f"GET {url}")
soup = BeautifulSoup(r.text, 'html.parser')
excel_pattern = re.compile(r'\.xlsx?$')
# Ищем все теги <a>, у которых атрибут href соответствует нашему паттерну
excel_tags = soup.find_all('a', href=excel_pattern)
excel_links = [tag.get('href') for tag in excel_tags]
# Предположим, вы уже получили excel_links из одного из методов выше
# excel_links = ['../../../upload/raspisanie/z/ОН_ХТФ_1 курс.xlsx', ...]
absolute_links = [urljoin(BASE_URL, relative_link) for relative_link in excel_links]
if facultet not in EXCEL_LINKS.keys():
EXCEL_LINKS[facultet] = set()
for excel_url in absolute_links:
EXCEL_LINKS[facultet].add(excel_url)
print(f"+url {excel_url}")
result = {} def process_excel_file(facultet, excel_url, counter, timeid):
faileds = [] is_xlsx = excel_url.endswith(".xlsx")
counter = 0 try:
for facultet in FACULTETS: filename = "excels/" + timeid + "_" + facultet + f"_[C{counter}]" + ".xls" + ("x" if is_xlsx else "")
counter += 1000 aigenerated.download_file_from_url(excel_url, filename)
print(f"\n\n-- Факультет '{facultet}' --")
facultet_urls = EXCEL_LINKS[facultet]
for excel_url in facultet_urls:
counter += 1
print(f"\n\n-- Ссылка --")
print(f"{excel_url}")
is_xlsx = excel_url.endswith(".xlsx")
try: reader = translations.create_reader(filename)
filename = "excels/" + facultet + filestime + f"[C{counter}]" + ".xls" + ("x" if is_xlsx else "") print("Reader info")
aigenerated.download_file_from_url(excel_url, filename) print(reader.info())
reader = translations.create_reader(filename)
while True:
print(f"Parsing sheet №{reader.get_sheet_index()+1}")
parser.LOGGING = False parser.LOGGING = False
prs = parser.Parser(reader) prs = parser.Parser(reader)
prs.parse() prs.parse()
@@ -98,27 +51,61 @@ for facultet in FACULTETS:
gr['data_source'] = excel_url.split("/")[-1] gr['data_source'] = excel_url.split("/")[-1]
gr['parser_debug'] = { gr['parser_debug'] = {
"C_COUNTER": counter, "C_COUNTER": counter,
"timestime": filestime, "timeid": timeid,
"excel_url": excel_url, "excel_url": excel_url,
"reader_info": reader.info(), "reader_info": reader.info(),
"reader_sheet_index": reader.get_sheet_index(),
"filename": filename "filename": filename
} }
print(f"Populates {len(prs.groups)} groups to result") print(f"Populates {len(prs.groups)} groups to result: " + " ".join(prs.groups.keys()))
except Exception as e: if not reader.has_next_sheet():
print(f"Error while {excel_url}") print("File ended")
print(e) break
traceback.print_exc() else:
faileds.append({ reader.next_sheet()
"ex": e, print("Next sheet!")
"fac": facultet,
"url": excel_url
})
json.dump(result, open('result.json', 'w'), indent=2, ensure_ascii=False) except Exception as e:
print(f"Error while {excel_url}")
print(e)
traceback.print_exc()
faileds.append({
"ex": e,
"fac": facultet,
"url": excel_url
})
print("Faileds:")
print(faileds)
print("Saved to result.json") result = {}
faileds = []
def main():
    """Scrape timetable links, process every Excel file and dump the results.

    Reads the module-level ``FACULTETS`` / ``DEBUG_ONE_FAC`` settings: when
    ``DEBUG_ONE_FAC`` is set, only that single department is processed.
    Each file is handed to ``process_excel_file`` together with a running
    ``counter`` and a ``timeid`` shared by the whole run. Afterwards the
    module-level ``result`` dict is written to ``result.json`` and the
    module-level ``faileds`` list is printed.
    """
    facultets = FACULTETS if DEBUG_ONE_FAC is None else [DEBUG_ONE_FAC]
    excel_links = links_parser.parse_links(facultets)

    counter = 0
    # One identifier per run, derived from the current Unix time.
    timeid = str(round(time.time()))

    for facultet, facultet_urls in excel_links.items():
        # Each department bumps the counter by 1000, each file by 1.
        counter += 1000
        print(f"\n\n-- Факультет '{facultet}' --")
        for excel_url in facultet_urls:
            counter += 1
            print("\n\n-- Ссылка --")
            print(f"{excel_url}")
            print("Start processing excel file")
            process_excel_file(facultet, excel_url, counter, timeid)
            print("Excel file processing done!")

    print("Saving result.json")
    # ensure_ascii=False emits raw non-ASCII text, so the file must be opened
    # as UTF-8 explicitly; `with` also guarantees the handle is flushed/closed.
    with open('result.json', 'w', encoding='utf-8') as out:
        json.dump(result, out, indent=2, ensure_ascii=False)
    print("Saved to result.json")

    print("Faileds:")
    print(faileds)


if __name__ == "__main__":
    main()
    print("Bye!")

270254
result.json

File diff suppressed because it is too large Load Diff

View File

@@ -15,7 +15,7 @@ class TranschendentnostCell:
self._is_empty = is_empty self._is_empty = is_empty
def is_empty(self): def is_empty(self):
self._is_empty return self._is_empty
class ExcelSheetReader(ABC): class ExcelSheetReader(ABC):
""" """
@@ -25,6 +25,18 @@ class ExcelSheetReader(ABC):
def __init__(self, file_path): def __init__(self, file_path):
self.file_path = file_path self.file_path = file_path
@abstractmethod
def get_sheet_index(self):
    """Return the index of the sheet the reader is currently positioned on.

    Abstract hook: concrete readers provide the implementation.
    """
@abstractmethod
def has_next_sheet(self):
    """Tell whether another sheet follows the current one.

    Abstract hook: concrete readers provide the implementation.
    """
@abstractmethod
def next_sheet(self):
    """Move the reader to the following sheet.

    Abstract hook: concrete readers provide the implementation.
    """
@abstractmethod @abstractmethod
def get_cell_value(self, row, col): def get_cell_value(self, row, col):
"""Возвращает значение ячейки по 0-индексированным координатам.""" """Возвращает значение ячейки по 0-индексированным координатам."""
@@ -96,8 +108,23 @@ class ExcelSheetReader(ABC):
class XlrdSheetReader(ExcelSheetReader): class XlrdSheetReader(ExcelSheetReader):
def __init__(self, file_path, sheet_index=0): def __init__(self, file_path, sheet_index=0):
super().__init__(file_path) super().__init__(file_path)
self.sheet_index = sheet_index
self.book = xlrd.open_workbook(file_path, formatting_info=True) self.book = xlrd.open_workbook(file_path, formatting_info=True)
self.sheet = self.book.sheet_by_index(sheet_index) self.init_sheet()
def get_sheet_index(self) -> int:
    """Return the index of the sheet this reader currently points at."""
    current_index = self.sheet_index
    return current_index
def init_sheet(self) -> None:
    """Load the sheet selected by ``self.sheet_index`` into ``self.sheet``."""
    index = self.sheet_index
    self.sheet = self.book.sheet_by_index(index)
def has_next_sheet(self) -> bool:
    """Tell whether the workbook has a sheet after the current one."""
    last_index = len(self.book.sheet_names()) - 1
    return self.sheet_index < last_index
def next_sheet(self) -> None:
    """Advance to the following sheet; do nothing when already on the last one."""
    if not self.has_next_sheet():
        return
    self.sheet_index += 1
    self.init_sheet()
def get_cell_value(self, row, col): def get_cell_value(self, row, col):
# Проверка на выход за пределы таблицы, чтобы избежать ошибок # Проверка на выход за пределы таблицы, чтобы избежать ошибок
@@ -106,9 +133,9 @@ class XlrdSheetReader(ExcelSheetReader):
return None return None
def info(self): def info(self):
print("The number of worksheets is {0}".format(self.book.nsheets)) return """[XLRD (.xls)] The number of worksheets is {0}
print("Worksheet name(s): {0}".format(self.book.sheet_names())) Worksheet name(s): {1}
return "'{0}': size: {1}x{2} names: ".format(self.sheet.name, self.sheet.nrows, self.sheet.ncols, " ".join(self.book.sheet_names())) '{2}': size: {3}x{4}""".format(self.book.nsheets, self.book.sheet_names(), self.sheet.name, self.sheet.nrows, self.sheet.ncols)
def cell(self, row, col): def cell(self, row, col):
"""Возвращает абстрактную клетку""" """Возвращает абстрактную клетку"""
@@ -150,8 +177,9 @@ class XlrdSheetReader(ExcelSheetReader):
class OpenpyxlSheetReader(ExcelSheetReader): class OpenpyxlSheetReader(ExcelSheetReader):
def __init__(self, file_path, sheet_name=None): def __init__(self, file_path, sheet_name=None):
super().__init__(file_path) super().__init__(file_path)
self.sheet_index = 0
self.workbook = openpyxl.load_workbook(file_path, data_only=True) self.workbook = openpyxl.load_workbook(file_path, data_only=True)
self.sheet = self.workbook[sheet_name] if sheet_name else self.workbook.active self.init_sheet()
# Словарь для трансляции стилей границ openpyxl в числовые коды xlrd # Словарь для трансляции стилей границ openpyxl в числовые коды xlrd
self.BORDER_STYLE_MAP = { self.BORDER_STYLE_MAP = {
@@ -161,6 +189,26 @@ class OpenpyxlSheetReader(ExcelSheetReader):
'slantDashDot': 13 'slantDashDot': 13
} }
def get_sheet_index(self) -> int:
    """Return the index of the worksheet this reader currently points at."""
    current_index = self.sheet_index
    return current_index
def has_next_sheet(self) -> bool:
    """Tell whether the workbook has a sheet after the current one."""
    last_index = len(self.workbook.sheetnames) - 1
    return self.sheet_index < last_index
def next_sheet(self) -> None:
    """Advance to the following worksheet; do nothing when already on the last one."""
    if not self.has_next_sheet():
        return
    self.sheet_index += 1
    self.init_sheet()
def init_sheet(self) -> None:
    """Load the worksheet selected by ``self.sheet_index`` into ``self.sheet``.

    The index is translated to a sheet name via ``workbook.sheetnames`` and
    the sheet is then looked up in the workbook by that name.
    """
    sheet_name = self.workbook.sheetnames[self.sheet_index]
    self.sheet = self.workbook[sheet_name]
def info(self):
    """Return a multi-line, human-readable summary of the workbook.

    The summary lists the number of worksheets, their names, and the title
    and size (rows x columns) of the currently selected worksheet.

    Returns:
        str: three lines of text separated by newlines.
    """
    sheet_names = self.workbook.sheetnames
    # Use the sheet's title: formatting the worksheet object itself would
    # print its repr (e.g. <Worksheet "Name">) instead of the plain name.
    return (
        "[OpenPyXL (.xlsx)] The number of worksheets is {0}\n"
        "Worksheet name(s): {1}\n"
        "'{2}': size: {3}x{4}"
    ).format(
        len(sheet_names),
        sheet_names,
        self.sheet.title,
        self.sheet.max_row,
        self.sheet.max_column,
    )
def _get_cell(self, row, col): def _get_cell(self, row, col):
"""Внутренний метод для получения ячейки с преобразованием координат.""" """Внутренний метод для получения ячейки с преобразованием координат."""
# openpyxl использует 1-индексированную систему # openpyxl использует 1-индексированную систему