work
This commit is contained in:
139
main.py
139
main.py
@@ -8,7 +8,7 @@ import xlwt
|
||||
|
||||
import xlrd
|
||||
import requests
|
||||
from requests.structures import CaseInsensitiveDict
|
||||
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
import aigenerated
|
||||
@@ -16,75 +16,28 @@ import parser
|
||||
import translations
|
||||
import utils
|
||||
import json
|
||||
import links_parser
|
||||
# Project-wide convention: coordinates are given as ROW first, then COL; indexing is zero-based
|
||||
|
||||
FACULTETS = [
|
||||
"asp", "mag", "fastiv", "fat", "ftkm", "ftpp", "feu", "fevt", "htf", "vkf", "mmf", "fpik"
|
||||
]
|
||||
BASE_URL = "https://www.vstu.ru/"
|
||||
RASP_PREFIX = "https://www.vstu.ru/student/raspisaniya/zanyatiy/index.php?dep="
|
||||
|
||||
# Shared HTTP session; the headers below are sent with every request made
# through it.
session = requests.Session()
session.headers = CaseInsensitiveDict(
    {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0",
        # The comma after this value was missing and the next entry was one
        # string ("Accept-Language: ..."), so implicit string concatenation
        # appended it to the Accept value and Accept-Language was never sent.
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
        "Referer": "http://dump.vstu.ru/",
        "Upgrade-Insecure-Requests": "1",
        "Priority": "u=0, i",
        "Pragma": "no-cache",
        # Was "no-cach", which is not a valid Cache-Control directive.
        "Cache-Control": "no-cache",
    }
)
|
||||
|
||||
EXCEL_LINKS = {}
|
||||
filestime = str(round(time.time()))
|
||||
# Matches hrefs that end in ".xls" or ".xlsx"; built once and reused for
# every faculty page.
excel_pattern = re.compile(r'\.xlsx?$')

for facultet in FACULTETS:
    url = RASP_PREFIX + facultet
    r = session.get(url)
    print(f"GET {url}")
    soup = BeautifulSoup(r.text, 'html.parser')

    # Take the href of every <a> tag whose target matches the pattern
    # (e.g. '../../../upload/raspisanie/z/ОН_ХТФ_1 курс.xlsx') and resolve
    # it against BASE_URL to get an absolute link.
    absolute_links = [
        urljoin(BASE_URL, tag.get('href'))
        for tag in soup.find_all('a', href=excel_pattern)
    ]

    # One set of links per faculty, so repeated links are stored once.
    facultet_links = EXCEL_LINKS.setdefault(facultet, set())
    for excel_url in absolute_links:
        facultet_links.add(excel_url)
        print(f"+url {excel_url}")
|
||||
DEBUG_ONE_FAC = None #'fevt'
|
||||
|
||||
|
||||
result = {}
|
||||
faileds = []
|
||||
counter = 0
|
||||
for facultet in FACULTETS:
|
||||
counter += 1000
|
||||
print(f"\n\n-- Факультет '{facultet}' --")
|
||||
facultet_urls = EXCEL_LINKS[facultet]
|
||||
for excel_url in facultet_urls:
|
||||
counter += 1
|
||||
print(f"\n\n-- Ссылка --")
|
||||
print(f"{excel_url}")
|
||||
is_xlsx = excel_url.endswith(".xlsx")
|
||||
def process_excel_file(facultet, excel_url, counter, timeid):
|
||||
is_xlsx = excel_url.endswith(".xlsx")
|
||||
try:
|
||||
filename = "excels/" + timeid + "_" + facultet + f"_[C{counter}]" + ".xls" + ("x" if is_xlsx else "")
|
||||
aigenerated.download_file_from_url(excel_url, filename)
|
||||
|
||||
try:
|
||||
filename = "excels/" + facultet + filestime + f"[C{counter}]" + ".xls" + ("x" if is_xlsx else "")
|
||||
aigenerated.download_file_from_url(excel_url, filename)
|
||||
|
||||
reader = translations.create_reader(filename)
|
||||
reader = translations.create_reader(filename)
|
||||
print("Reader info")
|
||||
print(reader.info())
|
||||
|
||||
while True:
|
||||
print(f"Parsing sheet №{reader.get_sheet_index()+1}")
|
||||
parser.LOGGING = False
|
||||
prs = parser.Parser(reader)
|
||||
prs.parse()
|
||||
@@ -92,33 +45,67 @@ for facultet in FACULTETS:
|
||||
if group_name in result.keys():
|
||||
print(f" -- WTF -- Doubled groups -- name: {group_name}")
|
||||
continue
|
||||
|
||||
|
||||
gr = result[group_name] = prs.groups[group_name]
|
||||
gr['facultet'] = facultet
|
||||
gr['data_source'] = excel_url.split("/")[-1]
|
||||
gr['parser_debug'] = {
|
||||
"C_COUNTER": counter,
|
||||
"timestime": filestime,
|
||||
"timeid": timeid,
|
||||
"excel_url": excel_url,
|
||||
"reader_info": reader.info(),
|
||||
"reader_sheet_index": reader.get_sheet_index(),
|
||||
"filename": filename
|
||||
}
|
||||
|
||||
print(f"Populates {len(prs.groups)} groups to result")
|
||||
print(f"Populates {len(prs.groups)} groups to result: " + " ".join(prs.groups.keys()))
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error while {excel_url}")
|
||||
print(e)
|
||||
traceback.print_exc()
|
||||
faileds.append({
|
||||
"ex": e,
|
||||
"fac": facultet,
|
||||
"url": excel_url
|
||||
})
|
||||
if not reader.has_next_sheet():
|
||||
print("File ended")
|
||||
break
|
||||
else:
|
||||
reader.next_sheet()
|
||||
print("Next sheet!")
|
||||
|
||||
json.dump(result, open('result.json', 'w'), indent=2, ensure_ascii=False)
|
||||
except Exception as e:
|
||||
print(f"Error while {excel_url}")
|
||||
print(e)
|
||||
traceback.print_exc()
|
||||
faileds.append({
|
||||
"ex": e,
|
||||
"fac": facultet,
|
||||
"url": excel_url
|
||||
})
|
||||
|
||||
print("Faileds:")
|
||||
print(faileds)
|
||||
|
||||
print("Saved to result.json")
|
||||
result = {}
|
||||
faileds = []
|
||||
def main():
    """Process every faculty's Excel links and save the collected result.

    Obtains the per-faculty links from ``links_parser.parse_links`` (for
    ``DEBUG_ONE_FAC`` alone when it is set, otherwise for all of
    ``FACULTETS``), hands each link to ``process_excel_file``, writes the
    module-level ``result`` to ``result.json`` and prints the module-level
    ``faileds`` list.
    """
    faculties = FACULTETS if DEBUG_ONE_FAC is None else [DEBUG_ONE_FAC]
    # presumably a dict mapping faculty code -> iterable of URLs; verify
    # against links_parser.parse_links
    excel_links = links_parser.parse_links(faculties)

    # The counter jumps by 1000 for each faculty and by 1 for each file; it
    # is passed on to process_excel_file together with the run's timeid.
    counter = 0
    timeid = str(round(time.time()))
    for facultet, facultet_urls in excel_links.items():
        counter += 1000
        print(f"\n\n-- Факультет '{facultet}' --")
        for excel_url in facultet_urls:
            counter += 1
            print(f"\n\n-- Ссылка --")
            print(f"{excel_url}")

            print("Start processing excel file")
            process_excel_file(facultet, excel_url, counter, timeid)
            print("Excel file processing done!")

    # NOTE(review): indentation was lost in the extracted source; the save and
    # the failure report are placed after both loops here - confirm.
    print("Saving result.json")
    # Close the file deterministically and write UTF-8 explicitly: with
    # ensure_ascii=False the output contains non-ASCII text, which must not
    # depend on the platform's default encoding.
    with open('result.json', 'w', encoding='utf-8') as out_file:
        json.dump(result, out_file, indent=2, ensure_ascii=False)
    print("Saved to result.json")

    print("Faileds:")
    print(faileds)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
print("Bye!")
|
||||
|
||||
Reference in New Issue
Block a user