work

2025-09-11 16:35:22 +03:00
parent babf491c8e
commit 6920d24a98
6 changed files with 173 additions and 276486 deletions
--- a/links_parser.py
+++ b/links_parser.py
@@ -0,0 +1,53 @@
+import re
+import time
+from urllib.parse import urljoin
+import requests
+from requests.structures import CaseInsensitiveDict
+from bs4 import BeautifulSoup
+
+BASE_URL = "https://www.vstu.ru/"  # base that scraped hrefs are resolved against via urljoin
+RASP_PREFIX = "https://www.vstu.ru/student/raspisaniya/zanyatiy/index.php?dep="  # a faculty id is appended as the `dep` value
+
+
+def parse_links(facultets):
+    session = requests.Session()
+    session.headers = CaseInsensitiveDict(
+        {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0",
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
+            "Accept-Language: ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3",
+            "Accept-Encoding": "gzip, deflate",
+            "Connection": "keep-alive",
+            "Referer": "http://dump.vstu.ru/",
+            "Upgrade-Insecure-Requests": "1",
+            "Priority": "u=0, i",
+            "Pragma": "no-cache",
+            "Cache-Control": "no-cach",
+        }
+    )
+
+    EXCEL_LINKS = {}
+    for facultet in facultets:
+        url = RASP_PREFIX + facultet
+        r = session.get(url)
+        print(f"GET {url}")
+        soup = BeautifulSoup(r.text, 'html.parser')
+        excel_pattern = re.compile(r'\.xlsx?$')
+
+        # Ищем все теги <a>, у которых атрибут href соответствует нашему паттерну
+        excel_tags = soup.find_all('a', href=excel_pattern)
+        excel_links = [tag.get('href') for tag in excel_tags]
+
+        # Предположим, вы уже получили excel_links из одного из методов выше
+        # excel_links = ['../../../upload/raspisanie/z/ОН_ХТФ_1 курс.xlsx', ...]
+
+        absolute_links = [urljoin(BASE_URL, relative_link) for relative_link in excel_links]
+
+        if facultet not in EXCEL_LINKS.keys():
+            EXCEL_LINKS[facultet] = set()
+
+        for excel_url in absolute_links:
+            EXCEL_LINKS[facultet].add(excel_url)
+            print(f"+url {excel_url}")
+    
+    return EXCEL_LINKS