fixes 3 pairs in a row, etc
This commit is contained in:
138
main.py
138
main.py
@@ -1,62 +1,127 @@
|
||||
# Copyright Stanislav Mironov
|
||||
|
||||
# Project-wide rule: coordinates are always given as ROW first, then COL; indexing is zero-based
|
||||
|
||||
|
||||
import json
|
||||
import re
|
||||
import os
|
||||
import time
|
||||
import traceback
|
||||
from urllib.parse import urljoin
|
||||
import pandas as pd
|
||||
import xlwt
|
||||
|
||||
import xlrd
|
||||
import requests
|
||||
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
import uuid
|
||||
import aigenerated
|
||||
import parser
|
||||
import translations
|
||||
import utils
|
||||
import json
|
||||
import links_parser
|
||||
# Project-wide rule: coordinates are always given as ROW first, then COL; indexing is zero-based
|
||||
import shutil
|
||||
|
||||
def currt():
    """Return the current Unix time as a whole number of seconds.

    The floating-point reading from ``time.time()`` is rounded to the
    nearest integer.
    """
    now = time.time()
    return round(now)
|
||||
|
||||
FACULTETS = [
|
||||
"asp", "mag", "fastiv", "fat", "ftkm", "ftpp", "feu", "fevt", "htf", "vkf", "mmf", "fpik"
|
||||
]
|
||||
DIRNAME = "excels"
|
||||
|
||||
DEBUG_ONE_FAC = None #'fevt'
|
||||
DEBUG_ONE_FAC = None #'htf'
|
||||
result_groups = {}
|
||||
result = {
|
||||
"version": 1,
|
||||
"notice": "ОТКАЗ ОТ ОТВЕТСТВЕННОСТИ: Данные, доступ к API и т.д. предоставляется КАК-ЕСТЬ (AS-IS) без каких либо, явно или не явно подразумеваемых гарантий.\n\nПарсер написал: Миронов Станислав",
|
||||
"actual_at": round(time.time()),
|
||||
"documentation": "TODO",
|
||||
"daypicture": "QwQ",
|
||||
"university": "VSTU",
|
||||
"university_site": "https://www.vstu.ru/",
|
||||
"stat": {
|
||||
"total_parsing_time": -1,
|
||||
},
|
||||
"api_notices": {
|
||||
"updated_at": 1757688552,
|
||||
"text": "Пожалуйста сохраняйте 'updated_at', это время изменения ЭТОГО текста. Тут возможно будут появлятся важные BREAKING CHANGES и дедлайны к ним.\nПо хорошему если updated_at другой по сравнению с вашем кэшем это сообщение должно отправляться вам в телеграм как уведомление о поедстоящих изменениях\nwarning=True значит 'text' содержит важное а не как щас hint.\n\n ~fazziclay aka Stanislav;",
|
||||
"warning": False,
|
||||
"tut-plavayuschaya-struktura": "required only 'updated_at', 'text' and 'warning'"
|
||||
},
|
||||
"doubled_groups": [],
|
||||
"debug": {
|
||||
"bleu~~": 1
|
||||
},
|
||||
"excels": [],
|
||||
"facultets": FACULTETS,
|
||||
|
||||
"emptykey1": "",
|
||||
"emptykey2": "",
|
||||
|
||||
"groups": result_groups,
|
||||
|
||||
"emptykey3": "",
|
||||
"emptykey4": "",
|
||||
"see_header_at_top_of_this_file": "SEE TOP OF THIS FILE | ОБРАТИТЕ ВНИМАНИЕ НА ВЕРХ ЭТОГО ФАЙЛА"
|
||||
}
|
||||
|
||||
def process_excel_file(facultet, excel_url, counter, timeid):
|
||||
is_xlsx = excel_url.endswith(".xlsx")
|
||||
filename = f"{DIRNAME}/" + timeid + f"_[C{counter}]_" + facultet + ".xls" + ("x" if is_xlsx else "")
|
||||
|
||||
excel_info = {
|
||||
"filename": excel_url.split("/")[-1],
|
||||
"url": excel_url,
|
||||
"download_place": filename,
|
||||
"stat": {
|
||||
"download": -1,
|
||||
"create_reader": -1,
|
||||
"parse": -1,
|
||||
"cycles": 0
|
||||
},
|
||||
"group_names_parsed": [],
|
||||
"facultet": facultet,
|
||||
"counter": counter
|
||||
}
|
||||
parser.LOGGING = False
|
||||
|
||||
try:
|
||||
filename = "excels/" + timeid + "_" + facultet + f"_[C{counter}]" + ".xls" + ("x" if is_xlsx else "")
|
||||
t = utils.StepTimeCounter()
|
||||
aigenerated.download_file_from_url(excel_url, filename)
|
||||
excel_info["stat"]['download'] = t.step()
|
||||
|
||||
reader = translations.create_reader(filename)
|
||||
print("Reader info")
|
||||
print(reader.info())
|
||||
excel_info["stat"]['create_reader'] = t.step()
|
||||
|
||||
while True:
|
||||
print(f"Parsing sheet №{reader.get_sheet_index()+1}")
|
||||
parser.LOGGING = False
|
||||
excel_info['stat']['cycles'] += 1
|
||||
print(f"Parsing sheet №{reader.get_sheet_index()+1} (from 1)")
|
||||
prs = parser.Parser(reader)
|
||||
prs.parse()
|
||||
if prs.parser_error is not None:
|
||||
excel_info["parser_error_cycle_" + excel_info['stat']['cycles']] = prs.parser_error
|
||||
|
||||
for group_name in prs.groups.keys():
|
||||
if group_name in result.keys():
|
||||
if group_name in result_groups.keys():
|
||||
print(f" -- WTF -- Doubled groups -- name: {group_name}")
|
||||
if 'warning_doubled_groups_skip' not in excel_info.keys():
|
||||
excel_info['warning_doubled_groups_skip'] = []
|
||||
|
||||
excel_info['warning_doubled_groups_skip'].append(group_name)
|
||||
result['doubled_groups'].append(group_name)
|
||||
|
||||
|
||||
continue
|
||||
|
||||
gr = result[group_name] = prs.groups[group_name]
|
||||
gr = result_groups[group_name] = prs.groups[group_name]
|
||||
gr['facultet'] = facultet
|
||||
gr['data_source'] = excel_url.split("/")[-1]
|
||||
gr['parser_debug'] = {
|
||||
"C_COUNTER": counter,
|
||||
gr['debug'] = {
|
||||
"counter": counter,
|
||||
"timeid": timeid,
|
||||
"excel_url": excel_url,
|
||||
"reader_info": reader.info(),
|
||||
"reader_sheet_index": reader.get_sheet_index(),
|
||||
"filename": filename
|
||||
}
|
||||
excel_info["group_names_parsed"].append(group_name)
|
||||
|
||||
print(f"Populates {len(prs.groups)} groups to result: " + " ".join(prs.groups.keys()))
|
||||
|
||||
@@ -67,22 +132,40 @@ def process_excel_file(facultet, excel_url, counter, timeid):
|
||||
reader.next_sheet()
|
||||
print("Next sheet!")
|
||||
|
||||
excel_info["stat"]['parse'] = t.step()
|
||||
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error while {excel_url}")
|
||||
print(e)
|
||||
traceback.print_exc()
|
||||
u = uuid.uuid4()
|
||||
excel_info['error'] = {
|
||||
"smile": ":(",
|
||||
"error_message": str(e),
|
||||
"log_anchor": str(u),
|
||||
"time": currt()
|
||||
}
|
||||
print(f"Log Anchor: {u}")
|
||||
faileds.append({
|
||||
"ex": e,
|
||||
"fac": facultet,
|
||||
"url": excel_url
|
||||
})
|
||||
|
||||
result['excels'].append(excel_info)
|
||||
|
||||
|
||||
result = {}
|
||||
faileds = []
|
||||
def main():
|
||||
EXCEL_LINKS = links_parser.parse_links(FACULTETS if DEBUG_ONE_FAC is None else [DEBUG_ONE_FAC])
|
||||
t = utils.StepTimeCounter()
|
||||
try:
|
||||
os.mkdir(DIRNAME)
|
||||
print(f"Directory '{DIRNAME}' created successfully.")
|
||||
except Exception:
|
||||
print(f"Directory '{DIRNAME}' already exists.")
|
||||
|
||||
print("main(); parse links starting...")
|
||||
EXCEL_LINKS = links_parser.parse_links(FACULTETS if DEBUG_ONE_FAC is None else [DEBUG_ONE_FAC])
|
||||
counter = 0
|
||||
timeid = str(round(time.time()))
|
||||
for facultet in EXCEL_LINKS.keys():
|
||||
@@ -99,13 +182,24 @@ def main():
|
||||
print("Excel file processing done!")
|
||||
|
||||
print("Saving result.json")
|
||||
|
||||
result['stat']['total_parsing_time'] = t.step()
|
||||
|
||||
json.dump(result, open('result.json', 'w'), indent=2, ensure_ascii=False)
|
||||
print("Saved to result.json")
|
||||
|
||||
print("Faileds:")
|
||||
print(faileds)
|
||||
|
||||
# Delete a non-empty directory and its contents
|
||||
try:
|
||||
shutil.rmtree(DIRNAME)
|
||||
print(f"Directory '{DIRNAME}' and its contents deleted successfully.")
|
||||
except Exception as e:
|
||||
print(f"Error deleting directory '{DIRNAME}': {e}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
print("Start")
|
||||
main()
|
||||
print("Bye!")
|
||||
|
||||
Reference in New Issue
Block a user