fixes 3 pairs подряд, etc

This commit is contained in:
2025-09-12 20:07:04 +03:00
parent 6920d24a98
commit ed65e5b483
8 changed files with 239 additions and 78 deletions

View File

@@ -1,3 +1,6 @@
# Copyright GEMINI
import re
# --- Ресурсы для алгоритма ---

View File

@@ -1,3 +1,4 @@
# Copyright Stanislav Mironov
class Coord:
@@ -13,6 +14,10 @@ class Coord:
return Coord(self.row if row is None else row,
self.col if col is None else col)
def copy(self) -> "Coord":
    """Return a new Coord built from this coordinate's row and column."""
    row, col = self.row, self.col
    return Coord(row, col)
def cell(self, reader: "ExcelSheetReader") -> "TranschendentnostCell":
    """Return whatever ``reader.cell`` yields for this coordinate's (row, col)."""
    target_row = self.row
    target_col = self.col
    return reader.cell(target_row, target_col)

Binary file not shown.

View File

@@ -1,5 +1,7 @@
# Copyright Stanislav Mironov
import re
import time
from urllib.parse import urljoin
import requests
from requests.structures import CaseInsensitiveDict
@@ -8,7 +10,7 @@ from bs4 import BeautifulSoup
BASE_URL = "https://www.vstu.ru/"
RASP_PREFIX = "https://www.vstu.ru/student/raspisaniya/zanyatiy/index.php?dep="
# Парсит ссылки на эксель .xls & .xlsx файлы и выдаёт их
def parse_links(facultets):
session = requests.Session()
session.headers = CaseInsensitiveDict(
@@ -18,17 +20,17 @@ def parse_links(facultets):
"Accept-Language: ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3",
"Accept-Encoding": "gzip, deflate",
"Connection": "keep-alive",
"Referer": "http://dump.vstu.ru/",
"Upgrade-Insecure-Requests": "1",
"Priority": "u=0, i",
"Pragma": "no-cache",
"Cache-Control": "no-cach",
"Cache-Control": "no-cach"
}
)
EXCEL_LINKS = {}
for facultet in facultets:
url = RASP_PREFIX + facultet
print("getting...")
r = session.get(url)
print(f"GET {url}")
soup = BeautifulSoup(r.text, 'html.parser')
@@ -51,3 +53,4 @@ def parse_links(facultets):
print(f"+url {excel_url}")
return EXCEL_LINKS

136
main.py
View File

@@ -1,62 +1,127 @@
# Copyright Stanislav Mironov
# Общее правило проекта, сначала в координатах идёт ROW а потом COL, нумерация с нуля
import json
import re
import os
import time
import traceback
from urllib.parse import urljoin
import pandas as pd
import xlwt
import xlrd
import requests
from bs4 import BeautifulSoup
import uuid
import aigenerated
import parser
import translations
import utils
import json
import links_parser
# Общее правило проекта, сначала в координатах идёт ROW а потом COL, нумерация с нуля
import shutil
def currt():
    """Return the current Unix time rounded to the nearest whole second."""
    now = time.time()
    return round(now)
# Faculty/department codes; main() hands this list to links_parser.parse_links().
FACULTETS = [
"asp", "mag", "fastiv", "fat", "ftkm", "ftpp", "feu", "fevt", "htf", "vkf", "mmf", "fpik"
]
# Working directory for downloaded Excel files; main() creates it on start
# and removes it (with its contents) when finished.
DIRNAME = "excels"
# Debug switch: when not None, main() requests links only for this single faculty code.
DEBUG_ONE_FAC = None #'fevt'
DEBUG_ONE_FAC = None #'htf'
# Group name -> parsed group data. process_excel_file() fills it; the very same
# dict object is exposed below as result["groups"].
result_groups = {}
# Root document that main() dumps to result.json.
result = {
"version": 1,
"notice": "ОТКАЗ ОТ ОТВЕТСТВЕННОСТИ: Данные, доступ к API и т.д. предоставляется КАК-ЕСТЬ (AS-IS) без каких либо, явно или не явно подразумеваемых гарантий.\n\nПарсер написал: Миронов Станислав",
# Evaluated once, at the moment this module is loaded.
"actual_at": round(time.time()),
"documentation": "TODO",
"daypicture": "QwQ",
"university": "VSTU",
"university_site": "https://www.vstu.ru/",
"stat": {
# -1 is a placeholder; main() overwrites it with the measured duration.
"total_parsing_time": -1,
},
"api_notices": {
"updated_at": 1757688552,
"text": "Пожалуйста сохраняйте 'updated_at', это время изменения ЭТОГО текста. Тут возможно будут появлятся важные BREAKING CHANGES и дедлайны к ним.\nПо хорошему если updated_at другой по сравнению с вашем кэшем это сообщение должно отправляться вам в телеграм как уведомление о поедстоящих изменениях\nwarning=True значит 'text' содержит важное а не как щас hint.\n\n ~fazziclay aka Stanislav;",
"warning": False,
"tut-plavayuschaya-struktura": "required only 'updated_at', 'text' and 'warning'"
},
# Names of groups encountered more than once; appended to by process_excel_file().
"doubled_groups": [],
"debug": {
"bleu~~": 1
},
# One info dict per processed Excel file; appended to by process_excel_file().
"excels": [],
"facultets": FACULTETS,
"emptykey1": "",
"emptykey2": "",
"groups": result_groups,
"emptykey3": "",
"emptykey4": "",
"see_header_at_top_of_this_file": "SEE TOP OF THIS FILE | ОБРАТИТЕ ВНИМАНИЕ НА ВЕРХ ЭТОГО ФАЙЛА"
}
def process_excel_file(facultet, excel_url, counter, timeid):
is_xlsx = excel_url.endswith(".xlsx")
filename = f"{DIRNAME}/" + timeid + f"_[C{counter}]_" + facultet + ".xls" + ("x" if is_xlsx else "")
excel_info = {
"filename": excel_url.split("/")[-1],
"url": excel_url,
"download_place": filename,
"stat": {
"download": -1,
"create_reader": -1,
"parse": -1,
"cycles": 0
},
"group_names_parsed": [],
"facultet": facultet,
"counter": counter
}
parser.LOGGING = False
try:
filename = "excels/" + timeid + "_" + facultet + f"_[C{counter}]" + ".xls" + ("x" if is_xlsx else "")
t = utils.StepTimeCounter()
aigenerated.download_file_from_url(excel_url, filename)
excel_info["stat"]['download'] = t.step()
reader = translations.create_reader(filename)
print("Reader info")
print(reader.info())
excel_info["stat"]['create_reader'] = t.step()
while True:
print(f"Parsing sheet №{reader.get_sheet_index()+1}")
parser.LOGGING = False
excel_info['stat']['cycles'] += 1
print(f"Parsing sheet №{reader.get_sheet_index()+1} (from 1)")
prs = parser.Parser(reader)
prs.parse()
if prs.parser_error is not None:
excel_info["parser_error_cycle_" + excel_info['stat']['cycles']] = prs.parser_error
for group_name in prs.groups.keys():
if group_name in result.keys():
if group_name in result_groups.keys():
print(f" -- WTF -- Doubled groups -- name: {group_name}")
if 'warning_doubled_groups_skip' not in excel_info.keys():
excel_info['warning_doubled_groups_skip'] = []
excel_info['warning_doubled_groups_skip'].append(group_name)
result['doubled_groups'].append(group_name)
continue
gr = result[group_name] = prs.groups[group_name]
gr = result_groups[group_name] = prs.groups[group_name]
gr['facultet'] = facultet
gr['data_source'] = excel_url.split("/")[-1]
gr['parser_debug'] = {
"C_COUNTER": counter,
gr['debug'] = {
"counter": counter,
"timeid": timeid,
"excel_url": excel_url,
"reader_info": reader.info(),
"reader_sheet_index": reader.get_sheet_index(),
"filename": filename
}
excel_info["group_names_parsed"].append(group_name)
print(f"Populates {len(prs.groups)} groups to result: " + " ".join(prs.groups.keys()))
@@ -67,22 +132,40 @@ def process_excel_file(facultet, excel_url, counter, timeid):
reader.next_sheet()
print("Next sheet!")
excel_info["stat"]['parse'] = t.step()
except Exception as e:
print(f"Error while {excel_url}")
print(e)
traceback.print_exc()
u = uuid.uuid4()
excel_info['error'] = {
"smile": ":(",
"error_message": str(e),
"log_anchor": str(u),
"time": currt()
}
print(f"Log Anchor: {u}")
faileds.append({
"ex": e,
"fac": facultet,
"url": excel_url
})
result['excels'].append(excel_info)
result = {}
faileds = []
def main():
EXCEL_LINKS = links_parser.parse_links(FACULTETS if DEBUG_ONE_FAC is None else [DEBUG_ONE_FAC])
t = utils.StepTimeCounter()
try:
os.mkdir(DIRNAME)
print(f"Directory '{DIRNAME}' created successfully.")
except Exception:
print(f"Directory '{DIRNAME}' already exists.")
print("main(); parse links starting...")
EXCEL_LINKS = links_parser.parse_links(FACULTETS if DEBUG_ONE_FAC is None else [DEBUG_ONE_FAC])
counter = 0
timeid = str(round(time.time()))
for facultet in EXCEL_LINKS.keys():
@@ -99,13 +182,24 @@ def main():
print("Excel file processing done!")
print("Saving result.json")
result['stat']['total_parsing_time'] = t.step()
json.dump(result, open('result.json', 'w'), indent=2, ensure_ascii=False)
print("Saved to result.json")
print("Faileds:")
print(faileds)
# Delete a non-empty directory and its contents
try:
shutil.rmtree(DIRNAME)
print(f"Directory '{DIRNAME}' and its contents deleted successfully.")
except Exception as e:
print(f"Error deleting directory '{DIRNAME}': {e}")
if __name__ == "__main__":
print("Start")
main()
print("Bye!")

100
parser.py
View File

@@ -1,7 +1,11 @@
# Copyright Stanislav Mironov
# Pair (lesson slot) labels in timetable order. Parser.process_group() relies on
# this order to restore a blank pair label via utils.next_element().
PAIR_NUMS = [
"1-2", "3-4", "5-6", "7-8", "9-10", "11-12", "13-14", "15-16"
]
import json
import xlrd
import uuid
import aigenerated
from coord import Coord, Merged
from translations import ExcelSheetReader
@@ -13,13 +17,13 @@ def pprint(*args, **kwargs):
if LOGGING:
print(*args, **kwargs)
class Parser:
def __init__(self, reader: ExcelSheetReader):
    """Bind the parser to ``reader`` and start with empty result containers."""
    self.reader = reader
    # Result containers start empty.
    self.groups, self.teachers, self.places = {}, set(), set()
    # Stays None unless parse() records a failure message.
    self.parser_error = None
    description = reader.info()
    pprint("Parser created for '{0}'".format(description))
def parse(self):
@@ -27,6 +31,7 @@ class Parser:
if monday is None:
print(" -- Failed parse! -- ")
print("ПОНЕДЕЛЬНИК НЕ НАЙДЕН!")
self.parser_error = "'ПОНЕДЕЛЬНИК' не найден в таблице."
return
head_rx = monday.row - 1 # выше первого понидельника
@@ -59,7 +64,7 @@ class Parser:
# location
location = merged.high.shift(down=1).cell(self.reader).value
return {"loc": str(location), "leader": str(speaker), "name": str(merged.cell(self.reader).value)}
return {"loc": str(location).strip(), "leader": str(speaker).strip(), "name": str(merged.cell(self.reader).value).strip()}
def process_group(self, group, monday):
"""
@@ -71,13 +76,13 @@ class Parser:
pprint(group_name)
row = group['position'][0] + 1 # counter for while, +1 for shift down; также номер строки в таблице (вроде с нуля)
weeknum = 1 # номер недели, щёлкнет +1 при каком-то условии.
previous_pair = None
while row < self.reader.get_row_count(): # maybe условие чтобы не уйти ниже чем есть строк
pos = Coord(row, group['position'][1]) # текущая позиция, верхний левый угол (=low)
pos_right = pos.shift(right=3)
pair_pos = pos.replace(col=5)
weekday_pos = pos.replace(col=4)
merged = self.reader.get_merged_coord(pos)
right_cell = pos_right.cell(self.reader)
merged_cell = merged.cell(self.reader)
cv = merged_cell.value
# В конце (12 пара:>) название группы, можно использовать как якорь
@@ -89,6 +94,16 @@ class Parser:
weekday = utils.unspace(weekday_mr.cell(self.reader).value)
pair_mr = self.reader.get_merged_coord(pair_pos)
pair = utils.unspace(pair_mr.cell(self.reader).value)
fuck_empty_pair_in_excel = pair == ""
previous_dump = previous_pair
if fuck_empty_pair_in_excel:
if previous_pair is None or previous_pair == "":
pair = f"EMPTY_IN_EXCEL_{uuid.uuid4()}"
else:
pair = utils.next_element(PAIR_NUMS, previous_pair)
if pair != "":
previous_pair = pair
skip = 0
if weekday == "":
@@ -99,26 +114,25 @@ class Parser:
row += 1
else:
break
if not skip:
next = 3 # на сколько пыгнуть для следующего шага?
is_empty_lesson = right_cell.is_empty() and merged_cell.is_empty()
dispname = ""
is_empty_lesson = len(utils.parse_all_dirt(self.reader, pos, 4, 3)) == 0 # если в поле не найдено ничего..
parsed_discipline_name = None
parsed_location = None
parsed_leader = None
is_2pair = False
pairs = 1
is_solid = pos_right in merged
parsed_uncotigorized = []
is_wide_maybe_potokoviy = merged.width() > 4 # потоковая ли лекция (занимает несколько групп.)
if is_empty_lesson:
dispname = "<no lesson>"
if not is_empty_lesson:
may_prepod = merged.low.shift(down=2)
if utils.has_no_bottom_border(self.reader, may_prepod):
next = 6
is_2pair = True
cur = merged.low.shift(down=2)
while utils.has_no_bottom_border(self.reader, cur):
next += 3
pairs += 1
cur = cur.shift(down=3)
if is_wide_maybe_potokoviy:
ret = self.parse_potokoviy(merged)
@@ -127,45 +141,37 @@ class Parser:
parsed_discipline_name = ret['name']
parsed_uncotigorized = list(utils.parse_all_dirt(self.reader, merged.low, merged.width(), next))
else:
if (is_solid):
parsed_discipline_name = cv
dispname = cv
dispname += (" SOLD" if is_solid else " SPLIT")
dispname += (" [ДВУПАРНЫЙ]" if is_2pair else "")
parsed_uncotigorized = list(utils.parse_all_dirt(self.reader, merged.low, 4, next))
if parsed_leader: dispname += f" [{parsed_leader}]"
if parsed_location: dispname += f" [{parsed_location}]"
dispname = dispname.replace("\n", "\\n")
pprint(f"[{group_name}] row={row}; {pos} {pos_right} {pair} {weekday}: {'[ПОТОКОВЫЙ] ' if is_wide_maybe_potokoviy else ''}{dispname} {parsed_uncotigorized}")
# пытаемся из некотегорезированных данных выцепить место и лидера (препода)
prepods = set()
if parsed_leader is not None: prepods.add(aigenerated.extract_last_name(parsed_leader))
if parsed_leader is not None: prepods.add(parsed_leader.strip())
locations = set()
if parsed_location is not None: locations.add(parsed_location.replace(" ", "").replace("-", ""))
if parsed_location is not None: locations.add(parsed_location.strip().replace(" ", ""))
for x in list(parsed_uncotigorized):
if aigenerated.is_surname_string(x):
prepods.add(aigenerated.extract_last_name(x))
prepods.add(x.strip())
if aigenerated.is_room_number(x):
locations.add(x.replace(" ", "").replace("-", "") if x is not None else None)
locations.add(x.strip().replace(" ", "") if x is not None else None)
# оставшееся в дисциплину (костыль)
# попытка починить пустую дисциплину
if parsed_discipline_name is None:
parsed_discipline_name = " ".join(parsed_uncotigorized)
l = utils.remove_from_list(list(parsed_uncotigorized), [parsed_leader, parsed_location])
parsed_discipline_name = " ".join(l)
prepods.discard(None)
prepods.discard("")
locations.discard(None)
locations.discard("")
# чистим сеты от мусора
utils.discards_list(prepods, nones=True, emptystrings=True)
utils.discards_list(locations, nones=True, emptystrings=True)
utils.discards_list(parsed_uncotigorized, nones=True, emptystrings=True)
# если не пустой предмет то записываем его
if not is_empty_lesson:
slots = group['slots']
w = weekday + ("_1" if weeknum == 1 else "_2")
@@ -174,20 +180,28 @@ class Parser:
today = slots[w]
today[pair] = {
"pos": str(pos),
"discipline": parsed_discipline_name,
"excel_pos": str(pos),
"discipline_name": parsed_discipline_name.strip(),
"locations": list(locations),
"leads": list(prepods),
"is_solid": is_solid,
"is_2pair": is_2pair,
"time_coeff": pairs,
"is_flow": is_wide_maybe_potokoviy,
"lefttopmerged": {
"width": merged.width(),
"height": merged.height(),
"excel_range": utils.merged_humanize(merged.as_numbers())
},
"raw": parsed_uncotigorized,
"weeday": utils.weekday_to_num(weekday),
"weekday": utils.weekday_to_num(weekday),
"weeknum": weeknum
}
self.teachers.add(aigenerated.extract_last_name(parsed_leader))
if fuck_empty_pair_in_excel:
today[pair]['pair_num_empty'] = {
"prev": previous_dump,
"restoted": pair != "",
"pair": pair
}
# INCREMENT на next и конец цикла.

View File

@@ -267,12 +267,11 @@ def create_reader(file_path, **kwargs) -> ExcelSheetReader:
Создает и возвращает подходящий экземпляр ридера в зависимости от расширения файла.
"""
if file_path.lower().endswith('.xlsx'):
print("Используется движок openpyxl для .xlsx")
return OpenpyxlSheetReader(file_path, **kwargs)
elif file_path.lower().endswith('.xls'):
print("Используется движок xlrd для .xls")
return XlrdSheetReader(file_path, **kwargs)
else:
raise ValueError("Неподдерживаемый формат файла. Используйте .xls или .xlsx")

View File

@@ -1,14 +1,58 @@
# gemini generated
# Copyright Stanislav Mironov
import time
import xlrd
from coord import Coord, Merged
from translations import ExcelSheetReader
import re
class StepTimeCounter:
    """Stopwatch reporting seconds since the previous step and since creation."""

    def __init__(self):
        # Placeholder value; setnow() below replaces it immediately.
        self.time: float = -1.0
        self.createtime = time.time()
        self.setnow()

    def setnow(self):
        """Restart the current step from the present moment."""
        self.time = time.time()

    def step(self, no_set_now=False):
        """Return seconds since the step started; restart it unless ``no_set_now``."""
        elapsed = time.time() - self.time
        if no_set_now:
            return elapsed
        self.setnow()
        return elapsed

    def from_create(self):
        """Return seconds elapsed since this counter was constructed."""
        return time.time() - self.createtime
EMPTY_CTYPES = [xlrd.XL_CELL_EMPTY, xlrd.XL_CELL_BLANK]
def discards_list(trg, nones=True, emptystrings=True):
    """Strip placeholder values from ``trg`` in place via remove_from_list().

    ``nones`` selects ``None`` for removal and ``emptystrings`` selects ``""``.
    Nothing is returned.
    """
    unwanted = []
    if nones:
        unwanted.append(None)
    if emptystrings:
        unwanted.append("")
    if unwanted:
        remove_from_list(trg, unwanted)
def has_no_bottom_border(reader: "ExcelSheetReader", coord):
    """Tell whether the edge under ``coord`` carries border style 0 on both sides.

    Both the 'bottom' style of ``coord`` and the 'top' style of the cell one
    row below must equal 0 (presumably "no border" - confirm against the reader).
    """
    if reader.get_border_style(coord, 'bottom') != 0:
        return False
    below = coord.shift(down=1)
    return reader.get_border_style(below, 'top') == 0
def find_element_index(my_list, element):
    """Return the position of the first ``element`` in ``my_list``, or -1 if absent."""
    return my_list.index(element) if element in my_list else -1
def next_element(arr, el, default=""):
    """Return the item that follows the first occurrence of ``el`` in ``arr``.

    ``default`` (an empty string unless overridden) is returned when ``el`` is
    not in ``arr`` or is its last item. Previously a missing ``el`` produced
    index -1 and silently returned ``arr[0]``, while the last item raised
    IndexError; the caller in the parser tests the result against ``""``.
    """
    try:
        index = arr.index(el)
    except ValueError:
        # Unknown element: there is no meaningful successor.
        return default
    following = index + 1
    return arr[following] if following < len(arr) else default
def remove_from_list(l: list, todel: list):
    """Remove every occurrence of each value in ``todel`` from ``l`` in place.

    Works on any container that supports ``in`` and ``.remove`` (lists and
    sets alike). Earlier only the first occurrence of each value was removed,
    so duplicated placeholders survived in lists.

    Returns the same ``l`` object to allow chaining.
    """
    for unwanted in todel:
        # list.remove drops one match per call, so loop until none remain.
        while unwanted in l:
            l.remove(unwanted)
    return l
def parse_all_dirt(reader: "ExcelSheetReader", min_pos, right, down):
RET = set()
@@ -17,17 +61,16 @@ def parse_all_dirt(reader: "ExcelSheetReader", min_pos, right, down):
col = min_pos.col
while col < min_pos.col + right:
#print(excel_coordinate(row, col))
value = str(reader.get_cell_value(row, col))
if value is not None and len(value) > 0:
cv = reader.get_cell_value(row, col)
value = str(cv).strip()
if cv is not None and len(value) > 0:
RET.add(value)
col += 1
row += 1
return RET
import re
# GEMINI
# GEMINI GENERATED
def normalize_name(raw_name):
"""
Приводит разнородные записи ФИО к единому структурированному виду.