work

2025-09-11 16:35:22 +03:00
parent babf491c8e
commit 6920d24a98
6 changed files with 173 additions and 276486 deletions
--- a/main.py
+++ b/main.py
@@ -8,7 +8,7 @@ import xlwt

 import xlrd
 import requests
-from requests.structures import CaseInsensitiveDict
+

 from bs4 import BeautifulSoup
 import aigenerated
@@ -16,75 +16,28 @@ import parser
 import translations
 import utils
 import json
+import links_parser
 # Project-wide rule: coordinates are always given as ROW first, then COL; numbering starts from zero

 FACULTETS = [
    "asp", "mag", "fastiv", "fat", "ftkm", "ftpp", "feu", "fevt", "htf", "vkf", "mmf", "fpik"
 ]
-BASE_URL = "https://www.vstu.ru/"
-RASP_PREFIX = "https://www.vstu.ru/student/raspisaniya/zanyatiy/index.php?dep="

-session = requests.Session()
-session.headers = CaseInsensitiveDict(
-    {
-        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0",
-        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
-        "Accept-Language: ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3",
-        "Accept-Encoding": "gzip, deflate",
-        "Connection": "keep-alive",
-        "Referer": "http://dump.vstu.ru/",
-        "Upgrade-Insecure-Requests": "1",
-        "Priority": "u=0, i",
-        "Pragma": "no-cache",
-        "Cache-Control": "no-cach",
-    }
-)
-
-EXCEL_LINKS = {}
-filestime = str(round(time.time()))
-for facultet in FACULTETS:
-    url = RASP_PREFIX + facultet
-    r = session.get(url)
-    print(f"GET {url}")
-    soup = BeautifulSoup(r.text, 'html.parser')
-    excel_pattern = re.compile(r'\.xlsx?$')
-
-    # Ищем все теги <a>, у которых атрибут href соответствует нашему паттерну
-    excel_tags = soup.find_all('a', href=excel_pattern)
-    excel_links = [tag.get('href') for tag in excel_tags]
-
-    # Предположим, вы уже получили excel_links из одного из методов выше
-    # excel_links = ['../../../upload/raspisanie/z/ОН_ХТФ_1 курс.xlsx', ...]
-
-    absolute_links = [urljoin(BASE_URL, relative_link) for relative_link in excel_links]
-
-    if facultet not in EXCEL_LINKS.keys():
-        EXCEL_LINKS[facultet] = set()
-
-    for excel_url in absolute_links:
-        EXCEL_LINKS[facultet].add(excel_url)
-        print(f"+url {excel_url}")
+DEBUG_ONE_FAC = None #'fevt'


-result = {}
-faileds = []
-counter = 0
-for facultet in FACULTETS:
-    counter += 1000
-    print(f"\n\n-- Факультет '{facultet}' --")
-    facultet_urls = EXCEL_LINKS[facultet]
-    for excel_url in facultet_urls:
-        counter += 1
-        print(f"\n\n-- Ссылка --")
-        print(f"{excel_url}")
-        is_xlsx = excel_url.endswith(".xlsx")
+def process_excel_file(facultet, excel_url, counter, timeid):
+    is_xlsx = excel_url.endswith(".xlsx")
+    try:
+        filename = "excels/" + timeid + "_" + facultet + f"_[C{counter}]" + ".xls" + ("x" if is_xlsx else "")
+        aigenerated.download_file_from_url(excel_url, filename)

-        try:
-            filename = "excels/" + facultet + filestime + f"[C{counter}]" + ".xls" + ("x" if is_xlsx else "")
-            aigenerated.download_file_from_url(excel_url, filename)
-
-            reader = translations.create_reader(filename)
+        reader = translations.create_reader(filename)
+        print("Reader info")
+        print(reader.info())

+        while True:
+            print(f"Parsing sheet №{reader.get_sheet_index()+1}")
            parser.LOGGING = False
            prs = parser.Parser(reader)
            prs.parse()
@@ -92,33 +45,67 @@ for facultet in FACULTETS:
                if group_name in result.keys():
                    print(f" -- WTF -- Doubled groups -- name: {group_name}")
                    continue
-                
+                        
                gr = result[group_name] = prs.groups[group_name]
                gr['facultet'] = facultet
                gr['data_source'] = excel_url.split("/")[-1]
                gr['parser_debug'] = {
                    "C_COUNTER": counter,
-                    "timestime": filestime,
+                    "timeid": timeid,
                    "excel_url": excel_url,
                    "reader_info": reader.info(),
+                    "reader_sheet_index": reader.get_sheet_index(),
                    "filename": filename
                }

-            print(f"Populates {len(prs.groups)} groups to result")
+            print(f"Populates {len(prs.groups)} groups to result: " + " ".join(prs.groups.keys()))

-        except Exception as e:
-            print(f"Error while {excel_url}")
-            print(e)
-            traceback.print_exc()
-            faileds.append({
-                "ex": e,
-                "fac": facultet,
-                "url": excel_url
-            })
+            if not reader.has_next_sheet():
+                print("File ended")
+                break
+            else:
+                reader.next_sheet()
+                print("Next sheet!")

-json.dump(result, open('result.json', 'w'), indent=2, ensure_ascii=False)
+    except Exception as e:
+        print(f"Error while {excel_url}")
+        print(e)
+        traceback.print_exc()
+        faileds.append({
+            "ex": e,
+            "fac": facultet,
+            "url": excel_url
+        })

-print("Faileds:")
-print(faileds)

-print("Saved to result.json")
+result = {}
+faileds = []
+def main():
+    """Download and parse every faculty's Excel files, then save the result.
+
+    Asks ``links_parser.parse_links`` for the Excel URLs of each faculty in
+    ``FACULTETS`` (or only ``DEBUG_ONE_FAC`` when it is not ``None``), passes
+    every URL to ``process_excel_file``, dumps the module-level ``result``
+    dict to ``result.json`` and finally prints the module-level ``faileds``
+    list.
+    """
+    facultets = FACULTETS if DEBUG_ONE_FAC is None else [DEBUG_ONE_FAC]
+    # NOTE(review): parse_links is assumed to return a mapping
+    # {facultet: iterable of excel urls} -- confirm against links_parser.
+    excel_links = links_parser.parse_links(facultets)
+
+    # counter jumps by 1000 per faculty and by 1 per file; together with
+    # timeid it is handed to process_excel_file, which puts both into the
+    # downloaded file name.
+    counter = 0
+    timeid = str(round(time.time()))
+    for facultet, facultet_urls in excel_links.items():
+        counter += 1000
+        print(f"\n\n-- Факультет '{facultet}' --")
+        for excel_url in facultet_urls:
+            counter += 1
+            print("\n\n-- Ссылка --")
+            print(f"{excel_url}")
+
+            print("Start processing excel file")
+            process_excel_file(facultet, excel_url, counter, timeid)
+            print("Excel file processing done!")
+
+    print("Saving result.json")
+    # ensure_ascii=False writes non-ASCII text verbatim, so the encoding is
+    # pinned to UTF-8 instead of the locale default; the with-block closes
+    # (and flushes) the file before success is reported.
+    with open('result.json', 'w', encoding='utf-8') as out:
+        json.dump(result, out, indent=2, ensure_ascii=False)
+    print("Saved to result.json")
+
+    print("Faileds:")
+    print(faileds)
+
+
+if __name__ == "__main__":
+    main()
+    print("Bye!")