fixes 3 pairs подряд, etc

2025-09-12 20:07:04 +03:00
parent 6920d24a98
commit ed65e5b483
8 changed files with 239 additions and 78 deletions
--- a/main.py
+++ b/main.py
@@ -1,62 +1,127 @@
+# Copyright Stanislav Mironov
+
+# Общее правило проекта, сначала в координатах идёт ROW а потом COL, нумерация с нуля
+
+
 import json
-import re
+import os
 import time
 import traceback
-from urllib.parse import urljoin
-import pandas as pd
-import xlwt
-
-import xlrd
-import requests
-
-
-from bs4 import BeautifulSoup
+import uuid
 import aigenerated
 import parser
 import translations
 import utils
 import json
 import links_parser
-# Общее правило проекта, сначала в координатах идёт ROW а потом COL, нумерация с нуля
+import shutil
+
+def currt():
+    return round(time.time())  # current Unix time in whole seconds (round() without ndigits returns an int)

 FACULTETS = [
    "asp", "mag", "fastiv", "fat", "ftkm", "ftpp", "feu", "fevt", "htf", "vkf", "mmf", "fpik"
 ]
+DIRNAME = "excels"  # working directory for downloaded spreadsheets; main() creates it and removes it at the end

-DEBUG_ONE_FAC = None #'fevt'
+DEBUG_ONE_FAC = None  # set to a single faculty code (e.g. 'htf') to parse only that faculty
+result_groups = {}  # group name -> parsed group data; the same dict object is exposed as result["groups"]
+result = {
+    "version": 1,
+    "notice": "ОТКАЗ ОТ ОТВЕТСТВЕННОСТИ: Данные, доступ к API и т.д. предоставляется КАК-ЕСТЬ (AS-IS) без каких либо, явно или не явно подразумеваемых гарантий.\n\nПарсер написал: Миронов Станислав",
+    "actual_at": round(time.time()),  # evaluated once, when this module-level dict is built
+    "documentation": "TODO",
+    "daypicture": "QwQ",
+    "university": "VSTU",
+    "university_site": "https://www.vstu.ru/",
+    "stat": {
+        "total_parsing_time": -1,  # placeholder; main() overwrites it just before saving
+    },
+    "api_notices": {
+        "updated_at": 1757688552,
+        "text": "Пожалуйста сохраняйте 'updated_at', это время изменения ЭТОГО текста. Тут возможно будут появлятся важные BREAKING CHANGES и дедлайны к ним.\nПо хорошему если updated_at другой по сравнению с вашем кэшем это сообщение должно отправляться вам в телеграм как уведомление о поедстоящих изменениях\nwarning=True значит 'text' содержит важное а не как щас hint.\n\n  ~fazziclay aka Stanislav;",
+        "warning": False,
+        "tut-plavayuschaya-struktura": "required only 'updated_at', 'text' and 'warning'"
+    },
+    "doubled_groups": [],  # names of groups met again after already being stored; the repeats are skipped
+    "debug": {
+        "bleu~~": 1
+    },
+    "excels": [],  # process_excel_file() appends one excel_info record per file
+    "facultets": FACULTETS,

+    "emptykey1": "",
+    "emptykey2": "",
+
+    "groups": result_groups,  # filled in place by process_excel_file()
+
+    "emptykey3": "",
+    "emptykey4": "",
+    "see_header_at_top_of_this_file": "SEE TOP OF THIS FILE | ОБРАТИТЕ ВНИМАНИЕ НА ВЕРХ ЭТОГО ФАЙЛА"
+}

 def process_excel_file(facultet, excel_url, counter, timeid):
    is_xlsx = excel_url.endswith(".xlsx")
+    filename = f"{DIRNAME}/" + timeid + f"_[C{counter}]_" + facultet + ".xls" + ("x" if is_xlsx else "")  # local download path
+
+    excel_info = {  # per-file report; appended to result['excels'] at the end, on success or failure
+        "filename": excel_url.split("/")[-1],
+        "url": excel_url,
+        "download_place": filename,
+        "stat": {
+            "download": -1,
+            "create_reader": -1,
+            "parse": -1,
+            "cycles": 0
+        },
+        "group_names_parsed": [],
+        "facultet": facultet,
+        "counter": counter
+    }
+    parser.LOGGING = False  # presumably turns parser logging off; now set once per file, not once per sheet
+
    try:
-        filename = "excels/" + timeid + "_" + facultet + f"_[C{counter}]" + ".xls" + ("x" if is_xlsx else "")
+        t = utils.StepTimeCounter()  # each t.step() result below is stored as a phase timing in excel_info['stat']
        aigenerated.download_file_from_url(excel_url, filename)
+        excel_info["stat"]['download'] = t.step()

        reader = translations.create_reader(filename)
        print("Reader info")
        print(reader.info())
+        excel_info["stat"]['create_reader'] = t.step()

        while True:
-            print(f"Parsing sheet №{reader.get_sheet_index()+1}")
-            parser.LOGGING = False
+            excel_info['stat']['cycles'] += 1  # 1-based count of sheets attempted for this file
+            print(f"Parsing sheet №{reader.get_sheet_index()+1} (from 1)")
            prs = parser.Parser(reader)
            prs.parse()
+            if prs.parser_error is not None:
+                excel_info[f"parser_error_cycle_{excel_info['stat']['cycles']}"] = prs.parser_error  # cycles is an int: "str" + int raised TypeError here
+
            for group_name in prs.groups.keys():
-                if group_name in result.keys():
+                if group_name in result_groups.keys():
                    print(f" -- WTF -- Doubled groups -- name: {group_name}")
+                    if 'warning_doubled_groups_skip' not in excel_info.keys():
+                        excel_info['warning_doubled_groups_skip'] = []
+                    
+                    excel_info['warning_doubled_groups_skip'].append(group_name)
+                    result['doubled_groups'].append(group_name)  # first occurrence wins; this repeat is only reported
+
+
                    continue
                        
-                gr = result[group_name] = prs.groups[group_name]
+                gr = result_groups[group_name] = prs.groups[group_name]
                gr['facultet'] = facultet
                gr['data_source'] = excel_url.split("/")[-1]
-                gr['parser_debug'] = {
-                    "C_COUNTER": counter,
+                gr['debug'] = {
+                    "counter": counter,
                    "timeid": timeid,
                    "excel_url": excel_url,
                    "reader_info": reader.info(),
                    "reader_sheet_index": reader.get_sheet_index(),
                    "filename": filename
                }
+                excel_info["group_names_parsed"].append(group_name)

            print(f"Populates {len(prs.groups)} groups to result: " + " ".join(prs.groups.keys()))

@@ -67,22 +132,40 @@ def process_excel_file(facultet, excel_url, counter, timeid):
                reader.next_sheet()
                print("Next sheet!")

+        excel_info["stat"]['parse'] = t.step()
+
+        
    except Exception as e:
        print(f"Error while {excel_url}")
        print(e)
        traceback.print_exc()
+        u = uuid.uuid4()  # random id printed to the log and stored with the error so the two can be matched
+        excel_info['error'] = {
+            "smile": ":(",
+            "error_message": str(e),
+            "log_anchor": str(u),
+            "time": currt()
+        }
+        print(f"Log Anchor: {u}")
        faileds.append({
            "ex": e,
            "fac": facultet,
            "url": excel_url
        })
+    
+    result['excels'].append(excel_info)

-
-result = {}
 faileds = []
 def main():
-    EXCEL_LINKS = links_parser.parse_links(FACULTETS if DEBUG_ONE_FAC is None else [DEBUG_ONE_FAC])
+    t = utils.StepTimeCounter()  # its step() result is stored below as result['stat']['total_parsing_time']
+    try:
+        os.mkdir(DIRNAME)
+        print(f"Directory '{DIRNAME}' created successfully.")
+    except Exception:  # NOTE(review): every mkdir failure lands here, not only "already exists"
+        print(f"Directory '{DIRNAME}' already exists.")

+    print("main(); parse links starting...")
+    EXCEL_LINKS = links_parser.parse_links(FACULTETS if DEBUG_ONE_FAC is None else [DEBUG_ONE_FAC])  # all faculties unless DEBUG_ONE_FAC is set
    counter = 0
    timeid = str(round(time.time()))
    for facultet in EXCEL_LINKS.keys():
@@ -99,13 +182,24 @@ def main():
            print("Excel file processing done!")

    print("Saving result.json")
+
+    result['stat']['total_parsing_time'] = t.step()
+
    json.dump(result, open('result.json', 'w'), indent=2, ensure_ascii=False)
    print("Saved to result.json")

    print("Faileds:")
    print(faileds)

+    # Remove the download directory and everything in it; a failure is only reported, not raised
+    try:
+        shutil.rmtree(DIRNAME)
+        print(f"Directory '{DIRNAME}' and its contents deleted successfully.")
+    except Exception as e:
+        print(f"Error deleting directory '{DIRNAME}': {e}")
+

 if __name__ == "__main__":
+    print("Start")
    main()
    print("Bye!")