Save resources, add sorting and latest_changes

This commit is contained in:
2025-09-18 20:05:57 +03:00
parent e04d87b76e
commit 832c2666c3
3 changed files with 60 additions and 51 deletions

71
main.py
View File

@@ -19,10 +19,11 @@ import shutil
def currt():
return round(time.time())
FACULTETS = [
FACULTETS = sorted([
"asp", "mag", "fastiv", "fat", "ftkm", "ftpp", "feu", "fevt", "htf", "vkf", "mmf", "fpik"
]
])
DIRNAME = "excels"
DIFFABLE_DATES = "diffable_dates.txt"
DEBUG_ONE_FAC = None #'htf'
result_groups = {}
@@ -63,20 +64,15 @@ result = {
"see_header_at_top_of_this_file": "SEE TOP OF THIS FILE | ОБРАТИТЕ ВНИМАНИЕ НА ВЕРХ ЭТОГО ФАЙЛА"
}
def process_excel_file(facultet, excel_url, counter, timeid):
def process_excel_file(facultet, excel_url, counter, latest_changed):
is_xlsx = excel_url.endswith(".xlsx")
filename = f"{DIRNAME}/" + timeid + f"_[C{counter}]_" + facultet + ".xls" + ("x" if is_xlsx else "")
filename = f"{DIRNAME}/" + f"_[C{counter}]_" + facultet + ".xls" + ("x" if is_xlsx else "")
excel_info = {
"filename": excel_url.split("/")[-1],
"url": excel_url,
"latest_changed": latest_changed,
"download_place": filename,
"stat": {
"download": -1,
"create_reader": -1,
"parse": -1,
"cycles": 0
},
"group_names_parsed": [],
"facultet": facultet,
"counter": counter
@@ -84,17 +80,12 @@ def process_excel_file(facultet, excel_url, counter, timeid):
parser.LOGGING = False
try:
t = utils.StepTimeCounter()
aigenerated.download_file_from_url(excel_url, filename)
excel_info["stat"]['download'] = t.step()
reader = translations.create_reader(filename)
print("Reader info")
print(reader.info())
excel_info["stat"]['create_reader'] = t.step()
while True:
excel_info['stat']['cycles'] += 1
print(f"Parsing sheet №{reader.get_sheet_index()+1} (from 1)")
prs = parser.Parser(reader)
@@ -103,7 +94,7 @@ def process_excel_file(facultet, excel_url, counter, timeid):
print("parsed done!")
if prs.parser_error is not None:
excel_info["parser_error_cycle_" + str(excel_info['stat']['cycles'])] = prs.parser_error
excel_info["parser_error_cycle_" + str(reader.get_sheet_index()+1)] = prs.parser_error
for group_name in prs.groups.keys():
if group_name in result_groups.keys():
@@ -121,8 +112,6 @@ def process_excel_file(facultet, excel_url, counter, timeid):
gr['facultet'] = facultet
gr['data_source'] = excel_url.split("/")[-1]
gr['debug'] = {
"counter": counter,
"timeid": timeid,
"excel_url": excel_url,
"reader_info": reader.info(),
"reader_sheet_index": reader.get_sheet_index(),
@@ -138,9 +127,6 @@ def process_excel_file(facultet, excel_url, counter, timeid):
else:
reader.next_sheet()
print("Next sheet!")
excel_info["stat"]['parse'] = t.step()
except Exception as e:
print(f"Error while {excel_url}")
@@ -164,6 +150,7 @@ def process_excel_file(facultet, excel_url, counter, timeid):
faileds = []
def main():
global result_groups, result
t = utils.StepTimeCounter()
try:
try:
@@ -179,22 +166,34 @@ def main():
print("main(); parse links starting...")
EXCEL_LINKS = links_parser.parse_links(FACULTETS if DEBUG_ONE_FAC is None else [DEBUG_ONE_FAC])
counter = 0
timeid = str(round(time.time()))
for facultet in EXCEL_LINKS.keys():
counter += 1000
print(f"\n\n-- Факультет '{facultet}' --")
facultet_urls = EXCEL_LINKS[facultet]
for excel_url in facultet_urls:
counter += 1
print(f"\n\n-- Ссылка --")
print(f"{excel_url}")
print("Start processing excel file")
process_excel_file(facultet, excel_url, counter, timeid)
print("Excel file processing done!")
now_diffable_dates = links_parser.excels_to_diffabledates(EXCEL_LINKS)
prev_diffable_dates = None
if os.path.exists("diffable_dates.txt"):
with open(DIFFABLE_DATES, 'r') as fp:
prev_diffable_dates = fp.read().strip()
with open(DIFFABLE_DATES, 'w') as fp:
fp.write(now_diffable_dates)
if now_diffable_dates == prev_diffable_dates:
print("No date changes in vstu.ru website. Stopping")
return
counter = 10000
for excel_link in EXCEL_LINKS:
counter += 1
facultet = excel_link['facultet']
excel_url = excel_link['url']
latest_changed = excel_link['last_changed']
process_excel_file(facultet, excel_url, counter, latest_changed)
print("Saving result.json")
group_names_alphabeticaly = sorted(result_groups.keys())
sorted_groups = {}
for group_name in group_names_alphabeticaly:
sorted_groups[group_name] = result_groups[group_name]
result['groups'] = sorted_groups
result['stat']['total_parsing_time'] = t.step()