work
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -2,3 +2,5 @@
|
|||||||
*.xlsx
|
*.xlsx
|
||||||
__pycache__
|
__pycache__
|
||||||
.idea
|
.idea
|
||||||
|
result.json
|
||||||
|
groups.json
|
||||||
|
|||||||
6149
groups.json
6149
groups.json
File diff suppressed because it is too large
Load Diff
53
links_parser.py
Normal file
53
links_parser.py
Normal file
@@ -0,0 +1,53 @@
|
|||||||
|
import re
|
||||||
|
import time
|
||||||
|
from urllib.parse import urljoin
|
||||||
|
import requests
|
||||||
|
from requests.structures import CaseInsensitiveDict
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
BASE_URL = "https://www.vstu.ru/"
|
||||||
|
RASP_PREFIX = "https://www.vstu.ru/student/raspisaniya/zanyatiy/index.php?dep="
|
||||||
|
|
||||||
|
|
||||||
|
def parse_links(facultets):
    """Collect absolute URLs of the Excel timetables published per faculty.

    For every faculty code the schedule page ``RASP_PREFIX + code`` is
    downloaded and every ``<a>`` tag whose ``href`` ends in ``.xls`` or
    ``.xlsx`` is collected.

    Args:
        facultets: iterable of faculty codes (strings) appended to
            ``RASP_PREFIX`` to build the page URL.

    Returns:
        dict mapping each faculty code to a ``set`` of absolute Excel URLs
        (an empty set when the page contains no matching links).

    Raises:
        requests.RequestException: if a page cannot be fetched (including a
            timeout).
    """
    session = requests.Session()
    # The default requests headers are replaced entirely with browser-like ones.
    session.headers = CaseInsensitiveDict(
        {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0",
            # A missing comma here used to glue the next literal onto this
            # value through implicit string concatenation, corrupting "Accept"
            # and dropping "Accept-Language" altogether.
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            "Connection": "keep-alive",
            "Referer": "http://dump.vstu.ru/",
            "Upgrade-Insecure-Requests": "1",
            "Priority": "u=0, i",
            "Pragma": "no-cache",
            "Cache-Control": "no-cache",
        }
    )

    # Loop-invariant: compile the ".xls" / ".xlsx" suffix pattern only once.
    excel_pattern = re.compile(r'\.xlsx?$')

    EXCEL_LINKS = {}
    for facultet in facultets:
        url = RASP_PREFIX + facultet
        # Without a timeout a stalled connection would block the run forever.
        r = session.get(url, timeout=30)
        print(f"GET {url}")
        soup = BeautifulSoup(r.text, 'html.parser')

        # Find every <a> tag whose href attribute matches the pattern.
        excel_tags = soup.find_all('a', href=excel_pattern)
        excel_links = [tag.get('href') for tag in excel_tags]

        # The hrefs are site-relative, e.g.
        # '../../../upload/raspisanie/z/ОН_ХТФ_1 курс.xlsx';
        # resolve them against the site root.
        absolute_links = [urljoin(BASE_URL, relative_link) for relative_link in excel_links]

        faculty_links = EXCEL_LINKS.setdefault(facultet, set())
        for excel_url in absolute_links:
            faculty_links.add(excel_url)
            print(f"+url {excel_url}")

    return EXCEL_LINKS
|
||||||
137
main.py
137
main.py
@@ -8,7 +8,7 @@ import xlwt
|
|||||||
|
|
||||||
import xlrd
|
import xlrd
|
||||||
import requests
|
import requests
|
||||||
from requests.structures import CaseInsensitiveDict
|
|
||||||
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
import aigenerated
|
import aigenerated
|
||||||
@@ -16,75 +16,28 @@ import parser
|
|||||||
import translations
|
import translations
|
||||||
import utils
|
import utils
|
||||||
import json
|
import json
|
||||||
|
import links_parser
|
||||||
# Общее правило проекта, сначала в координатах идёт ROW а потом COL, нумерация с нуля
|
# Общее правило проекта, сначала в координатах идёт ROW а потом COL, нумерация с нуля
|
||||||
|
|
||||||
FACULTETS = [
|
FACULTETS = [
|
||||||
"asp", "mag", "fastiv", "fat", "ftkm", "ftpp", "feu", "fevt", "htf", "vkf", "mmf", "fpik"
|
"asp", "mag", "fastiv", "fat", "ftkm", "ftpp", "feu", "fevt", "htf", "vkf", "mmf", "fpik"
|
||||||
]
|
]
|
||||||
BASE_URL = "https://www.vstu.ru/"
|
|
||||||
RASP_PREFIX = "https://www.vstu.ru/student/raspisaniya/zanyatiy/index.php?dep="
|
|
||||||
|
|
||||||
session = requests.Session()
|
DEBUG_ONE_FAC = None #'fevt'
|
||||||
session.headers = CaseInsensitiveDict(
|
|
||||||
{
|
|
||||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0",
|
|
||||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
|
|
||||||
"Accept-Language: ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3",
|
|
||||||
"Accept-Encoding": "gzip, deflate",
|
|
||||||
"Connection": "keep-alive",
|
|
||||||
"Referer": "http://dump.vstu.ru/",
|
|
||||||
"Upgrade-Insecure-Requests": "1",
|
|
||||||
"Priority": "u=0, i",
|
|
||||||
"Pragma": "no-cache",
|
|
||||||
"Cache-Control": "no-cach",
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
EXCEL_LINKS = {}
|
|
||||||
filestime = str(round(time.time()))
|
|
||||||
for facultet in FACULTETS:
|
|
||||||
url = RASP_PREFIX + facultet
|
|
||||||
r = session.get(url)
|
|
||||||
print(f"GET {url}")
|
|
||||||
soup = BeautifulSoup(r.text, 'html.parser')
|
|
||||||
excel_pattern = re.compile(r'\.xlsx?$')
|
|
||||||
|
|
||||||
# Ищем все теги <a>, у которых атрибут href соответствует нашему паттерну
|
|
||||||
excel_tags = soup.find_all('a', href=excel_pattern)
|
|
||||||
excel_links = [tag.get('href') for tag in excel_tags]
|
|
||||||
|
|
||||||
# Предположим, вы уже получили excel_links из одного из методов выше
|
|
||||||
# excel_links = ['../../../upload/raspisanie/z/ОН_ХТФ_1 курс.xlsx', ...]
|
|
||||||
|
|
||||||
absolute_links = [urljoin(BASE_URL, relative_link) for relative_link in excel_links]
|
|
||||||
|
|
||||||
if facultet not in EXCEL_LINKS.keys():
|
|
||||||
EXCEL_LINKS[facultet] = set()
|
|
||||||
|
|
||||||
for excel_url in absolute_links:
|
|
||||||
EXCEL_LINKS[facultet].add(excel_url)
|
|
||||||
print(f"+url {excel_url}")
|
|
||||||
|
|
||||||
|
|
||||||
result = {}
|
def process_excel_file(facultet, excel_url, counter, timeid):
|
||||||
faileds = []
|
is_xlsx = excel_url.endswith(".xlsx")
|
||||||
counter = 0
|
try:
|
||||||
for facultet in FACULTETS:
|
filename = "excels/" + timeid + "_" + facultet + f"_[C{counter}]" + ".xls" + ("x" if is_xlsx else "")
|
||||||
counter += 1000
|
aigenerated.download_file_from_url(excel_url, filename)
|
||||||
print(f"\n\n-- Факультет '{facultet}' --")
|
|
||||||
facultet_urls = EXCEL_LINKS[facultet]
|
|
||||||
for excel_url in facultet_urls:
|
|
||||||
counter += 1
|
|
||||||
print(f"\n\n-- Ссылка --")
|
|
||||||
print(f"{excel_url}")
|
|
||||||
is_xlsx = excel_url.endswith(".xlsx")
|
|
||||||
|
|
||||||
try:
|
reader = translations.create_reader(filename)
|
||||||
filename = "excels/" + facultet + filestime + f"[C{counter}]" + ".xls" + ("x" if is_xlsx else "")
|
print("Reader info")
|
||||||
aigenerated.download_file_from_url(excel_url, filename)
|
print(reader.info())
|
||||||
|
|
||||||
reader = translations.create_reader(filename)
|
|
||||||
|
|
||||||
|
while True:
|
||||||
|
print(f"Parsing sheet №{reader.get_sheet_index()+1}")
|
||||||
parser.LOGGING = False
|
parser.LOGGING = False
|
||||||
prs = parser.Parser(reader)
|
prs = parser.Parser(reader)
|
||||||
prs.parse()
|
prs.parse()
|
||||||
@@ -98,27 +51,61 @@ for facultet in FACULTETS:
|
|||||||
gr['data_source'] = excel_url.split("/")[-1]
|
gr['data_source'] = excel_url.split("/")[-1]
|
||||||
gr['parser_debug'] = {
|
gr['parser_debug'] = {
|
||||||
"C_COUNTER": counter,
|
"C_COUNTER": counter,
|
||||||
"timestime": filestime,
|
"timeid": timeid,
|
||||||
"excel_url": excel_url,
|
"excel_url": excel_url,
|
||||||
"reader_info": reader.info(),
|
"reader_info": reader.info(),
|
||||||
|
"reader_sheet_index": reader.get_sheet_index(),
|
||||||
"filename": filename
|
"filename": filename
|
||||||
}
|
}
|
||||||
|
|
||||||
print(f"Populates {len(prs.groups)} groups to result")
|
print(f"Populates {len(prs.groups)} groups to result: " + " ".join(prs.groups.keys()))
|
||||||
|
|
||||||
except Exception as e:
|
if not reader.has_next_sheet():
|
||||||
print(f"Error while {excel_url}")
|
print("File ended")
|
||||||
print(e)
|
break
|
||||||
traceback.print_exc()
|
else:
|
||||||
faileds.append({
|
reader.next_sheet()
|
||||||
"ex": e,
|
print("Next sheet!")
|
||||||
"fac": facultet,
|
|
||||||
"url": excel_url
|
|
||||||
})
|
|
||||||
|
|
||||||
json.dump(result, open('result.json', 'w'), indent=2, ensure_ascii=False)
|
except Exception as e:
|
||||||
|
print(f"Error while {excel_url}")
|
||||||
|
print(e)
|
||||||
|
traceback.print_exc()
|
||||||
|
faileds.append({
|
||||||
|
"ex": e,
|
||||||
|
"fac": facultet,
|
||||||
|
"url": excel_url
|
||||||
|
})
|
||||||
|
|
||||||
print("Faileds:")
|
|
||||||
print(faileds)
|
|
||||||
|
|
||||||
print("Saved to result.json")
|
result = {}
|
||||||
|
faileds = []
|
||||||
|
def main():
    """Download and parse every faculty timetable, then write result.json.

    Links are collected through ``links_parser.parse_links`` for all of
    ``FACULTETS`` (or only ``DEBUG_ONE_FAC`` when it is set). Each Excel URL
    is handed to ``process_excel_file`` together with a running counter and
    a timestamp id shared by the whole run. Finally the module-level
    ``result`` is dumped to ``result.json`` and ``faileds`` is printed.
    """
    EXCEL_LINKS = links_parser.parse_links(FACULTETS if DEBUG_ONE_FAC is None else [DEBUG_ONE_FAC])

    counter = 0
    timeid = str(round(time.time()))
    for facultet, facultet_urls in EXCEL_LINKS.items():
        # Each faculty advances the counter by 1000; each file by 1.
        counter += 1000
        print(f"\n\n-- Факультет '{facultet}' --")
        # The URLs arrive as a set; sorting makes the counter assigned to
        # each file reproducible between runs.
        for excel_url in sorted(facultet_urls):
            counter += 1
            print("\n\n-- Ссылка --")
            print(f"{excel_url}")

            print("Start processing excel file")
            process_excel_file(facultet, excel_url, counter, timeid)
            print("Excel file processing done!")

    print("Saving result.json")
    # ensure_ascii=False writes raw non-ASCII text, so the encoding is pinned
    # to UTF-8 instead of the platform default; the context manager closes
    # (and flushes) the file deterministically.
    with open('result.json', 'w', encoding='utf-8') as result_file:
        json.dump(result, result_file, indent=2, ensure_ascii=False)
    print("Saved to result.json")

    print("Faileds:")
    print(faileds)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
print("Bye!")
|
||||||
|
|||||||
270254
result.json
270254
result.json
File diff suppressed because it is too large
Load Diff
@@ -15,7 +15,7 @@ class TranschendentnostCell:
|
|||||||
self._is_empty = is_empty
|
self._is_empty = is_empty
|
||||||
|
|
||||||
def is_empty(self):
|
def is_empty(self):
|
||||||
self._is_empty
|
return self._is_empty
|
||||||
|
|
||||||
class ExcelSheetReader(ABC):
|
class ExcelSheetReader(ABC):
|
||||||
"""
|
"""
|
||||||
@@ -25,6 +25,18 @@ class ExcelSheetReader(ABC):
|
|||||||
def __init__(self, file_path):
|
def __init__(self, file_path):
|
||||||
self.file_path = file_path
|
self.file_path = file_path
|
||||||
|
|
||||||
|
@abstractmethod
def get_sheet_index(self):
    """Return the index of the worksheet the reader is currently on."""
|
||||||
|
|
||||||
|
@abstractmethod
def has_next_sheet(self):
    """Tell whether another worksheet follows the current one."""
|
||||||
|
|
||||||
|
@abstractmethod
def next_sheet(self):
    """Advance the reader to the following worksheet."""
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def get_cell_value(self, row, col):
|
def get_cell_value(self, row, col):
|
||||||
"""Возвращает значение ячейки по 0-индексированным координатам."""
|
"""Возвращает значение ячейки по 0-индексированным координатам."""
|
||||||
@@ -96,8 +108,23 @@ class ExcelSheetReader(ABC):
|
|||||||
class XlrdSheetReader(ExcelSheetReader):
|
class XlrdSheetReader(ExcelSheetReader):
|
||||||
def __init__(self, file_path, sheet_index=0):
|
def __init__(self, file_path, sheet_index=0):
|
||||||
super().__init__(file_path)
|
super().__init__(file_path)
|
||||||
|
self.sheet_index = sheet_index
|
||||||
self.book = xlrd.open_workbook(file_path, formatting_info=True)
|
self.book = xlrd.open_workbook(file_path, formatting_info=True)
|
||||||
self.sheet = self.book.sheet_by_index(sheet_index)
|
self.init_sheet()
|
||||||
|
|
||||||
|
def get_sheet_index(self):
    """Return the 0-based index of the worksheet currently loaded."""
    current = self.sheet_index
    return current
|
||||||
|
|
||||||
|
def init_sheet(self):
    """Point ``self.sheet`` at the worksheet numbered ``self.sheet_index``."""
    position = self.sheet_index
    self.sheet = self.book.sheet_by_index(position)
|
||||||
|
|
||||||
|
def has_next_sheet(self):
    """Return True when the current worksheet is not the last one in the book."""
    last_index = len(self.book.sheet_names()) - 1
    return self.sheet_index < last_index
|
||||||
|
|
||||||
|
def next_sheet(self):
    """Move to the following worksheet; do nothing when already on the last."""
    if not self.has_next_sheet():
        return
    self.sheet_index = self.sheet_index + 1
    self.init_sheet()
|
||||||
|
|
||||||
def get_cell_value(self, row, col):
|
def get_cell_value(self, row, col):
|
||||||
# Проверка на выход за пределы таблицы, чтобы избежать ошибок
|
# Проверка на выход за пределы таблицы, чтобы избежать ошибок
|
||||||
@@ -106,9 +133,9 @@ class XlrdSheetReader(ExcelSheetReader):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
def info(self):
|
def info(self):
|
||||||
print("The number of worksheets is {0}".format(self.book.nsheets))
|
return """[XLRD (.xls)] The number of worksheets is {0}
|
||||||
print("Worksheet name(s): {0}".format(self.book.sheet_names()))
|
Worksheet name(s): {1}
|
||||||
return "'{0}': size: {1}x{2} names: ".format(self.sheet.name, self.sheet.nrows, self.sheet.ncols, " ".join(self.book.sheet_names()))
|
'{2}': size: {3}x{4}""".format(self.book.nsheets, self.book.sheet_names(), self.sheet.name, self.sheet.nrows, self.sheet.ncols)
|
||||||
|
|
||||||
def cell(self, row, col):
|
def cell(self, row, col):
|
||||||
"""Возвращает абстрактную клетку"""
|
"""Возвращает абстрактную клетку"""
|
||||||
@@ -150,8 +177,9 @@ class XlrdSheetReader(ExcelSheetReader):
|
|||||||
class OpenpyxlSheetReader(ExcelSheetReader):
|
class OpenpyxlSheetReader(ExcelSheetReader):
|
||||||
def __init__(self, file_path, sheet_name=None):
|
def __init__(self, file_path, sheet_name=None):
|
||||||
super().__init__(file_path)
|
super().__init__(file_path)
|
||||||
|
self.sheet_index = 0
|
||||||
self.workbook = openpyxl.load_workbook(file_path, data_only=True)
|
self.workbook = openpyxl.load_workbook(file_path, data_only=True)
|
||||||
self.sheet = self.workbook[sheet_name] if sheet_name else self.workbook.active
|
self.init_sheet()
|
||||||
|
|
||||||
# Словарь для трансляции стилей границ openpyxl в числовые коды xlrd
|
# Словарь для трансляции стилей границ openpyxl в числовые коды xlrd
|
||||||
self.BORDER_STYLE_MAP = {
|
self.BORDER_STYLE_MAP = {
|
||||||
@@ -161,6 +189,26 @@ class OpenpyxlSheetReader(ExcelSheetReader):
|
|||||||
'slantDashDot': 13
|
'slantDashDot': 13
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def get_sheet_index(self):
    """Return the 0-based index of the worksheet currently loaded."""
    current = self.sheet_index
    return current
|
||||||
|
|
||||||
|
def has_next_sheet(self):
    """Return True when the current worksheet is not the last one in the workbook."""
    last_index = len(self.workbook.sheetnames) - 1
    return self.sheet_index < last_index
|
||||||
|
|
||||||
|
def next_sheet(self):
    """Move to the following worksheet; do nothing when already on the last."""
    if not self.has_next_sheet():
        return
    self.sheet_index = self.sheet_index + 1
    self.init_sheet()
|
||||||
|
|
||||||
|
def init_sheet(self):
    """Point ``self.sheet`` at the worksheet numbered ``self.sheet_index``."""
    sheet_name = self.workbook.sheetnames[self.sheet_index]
    self.sheet = self.workbook[sheet_name]
|
||||||
|
|
||||||
|
def info(self):
    """Return a three-line, human-readable summary of the workbook.

    The lines hold the number of worksheets, the list of worksheet names,
    and the current worksheet's title with its ``max_row`` x ``max_column``
    size.

    The current sheet is reported by its ``title``. Formatting the
    worksheet object itself produced ``'<Worksheet "name">'``, which did
    not match the bare sheet name printed by the xlrd-based reader.
    """
    sheet_names = self.workbook.sheetnames
    # Lines are joined explicitly so the report does not depend on how this
    # method happens to be indented in the source file.
    return "\n".join((
        "[OpenPyXL (.xlsx)] The number of worksheets is {0}".format(len(sheet_names)),
        "Worksheet name(s): {0}".format(sheet_names),
        "'{0}': size: {1}x{2}".format(self.sheet.title, self.sheet.max_row, self.sheet.max_column),
    ))
|
||||||
|
|
||||||
|
|
||||||
def _get_cell(self, row, col):
|
def _get_cell(self, row, col):
|
||||||
"""Внутренний метод для получения ячейки с преобразованием координат."""
|
"""Внутренний метод для получения ячейки с преобразованием координат."""
|
||||||
# openpyxl использует 1-индексированную систему
|
# openpyxl использует 1-индексированную систему
|
||||||
|
|||||||
Reference in New Issue
Block a user