Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Обновлен сбор онлайн курсов #223

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions application/onlinecourse/admin.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
from django.contrib import admin

from .models import Institution, Platform, OnlineCourse
from .models import OnlineCourse

# Make each online-course model editable through the Django admin site.
admin.site.register(Institution)
admin.site.register(Platform)
admin.site.register(OnlineCourse)
72 changes: 16 additions & 56 deletions application/onlinecourse/models.py
Original file line number Diff line number Diff line change
@@ -1,89 +1,49 @@
from django.db import models
from dataprocessing.models import Items

class Institution(models.Model):
    """
    Model for the rights holders of online courses.
    """
    # Required display name of the rights holder.
    title = models.CharField(max_length=1024, verbose_name='Название', blank=False, null=False)
    # Optional external identifier; per its verbose name, the ID that comes from "РОО".
    id_from_roo = models.CharField(max_length=1024, verbose_name='ID, который приходит от РОО', blank=True, null=True)

    class Meta:
        # Human-readable singular / plural names of the model.
        verbose_name = 'Правообладатель'
        verbose_name_plural = 'Правообладатели'

    def __str__(self):
        # An institution is represented by its title.
        return self.title


class Platform(models.Model):
    """
    Model for the platforms that host online courses.
    """
    # Required display name of the platform.
    title = models.CharField(max_length=1024, verbose_name='Название', blank=False, null=False)
    # Optional external identifier; per its verbose name, the ID that comes from "РОО".
    id_from_roo = models.CharField(max_length=1024, verbose_name='ID, который приходит от РОО', blank=True, null=True)

    class Meta:
        # Human-readable singular / plural names of the model.
        verbose_name = 'Платформа'
        verbose_name_plural = 'Платформы'

    def __str__(self):
        # A platform is represented by its title.
        return self.title


class OnlineCourse(models.Model):
    """
    Online course model.
    """
    # NOTE(review): several attributes below (title, id_from_roo, institution,
    # platform, started_at, record_end_at, finished_at, content, external_url,
    # has_certificate, requirements, competences, learning_outcome) are declared
    # twice. This looks like the removed and the added lines of a change set shown
    # together -- confirm which declaration is intended. As plain Python, the later
    # assignment of a name replaces the earlier one.
    title = models.CharField(max_length=1024, verbose_name='Название', blank=False, null=False)
    id_from_roo = models.CharField(max_length=1024, verbose_name='ID, который приходит от РОО', blank=True, null=True)
    title = models.CharField(max_length=4000, verbose_name='Название', blank=False, null=False)
    id_from_roo = models.CharField(max_length=4000, verbose_name='ID, который приходит от РОО', blank=True, null=True)
    # Required free-text description of the course.
    description = models.TextField(verbose_name='Описание', blank=False, null=False)
    # Rights holder and platform as foreign keys (deleting the referenced row cascades) ...
    institution = models.ForeignKey('Institution', on_delete=models.CASCADE, verbose_name="Правообладатель", blank=False,
                                    null=False)
    platform = models.ForeignKey('Platform', on_delete=models.CASCADE, verbose_name="Платформа", blank=False,
                                 null=False)
    # ... and the same two attributes declared as plain required strings.
    institution = models.CharField(max_length=4000, verbose_name='Правообладатель', blank=False, null=False)
    platform = models.CharField(max_length=4000, verbose_name='Платформа', blank=False, null=False)
    # Allowed (stored value, display label) pairs for the course language.
    LanguageChoices = [
        ('ru', 'Русский'),
        ('en', 'Английский'),
        ('ru/en', 'Русский/Английский'),
    ]
    # NOTE(review): max_length is passed twice in this call, which Python rejects
    # as a repeated keyword argument -- presumably the old (5) and new (30) values
    # shown side by side; confirm and keep only one.
    language = models.CharField(
        max_length=5,
        max_length=30,
        choices=LanguageChoices,
        verbose_name='Язык онлайн курса',
        blank=False, null=False
    )
    # Course dates: start, creation, end of enrolment, end of the course.
    started_at = models.DateField(blank=True, null=True, verbose_name='Дата начала курса')
    created_at = models.DateField(blank=True, null=True, verbose_name='Дата создания курса')
    record_end_at = models.DateField(blank=True, null=True, verbose_name='Дата окончания записи на курс')
    finished_at = models.DateField(blank=True, null=True, verbose_name='Дата окончания курса')
    # Second set of date declarations: the start date is kept as a string here,
    # and the two end dates no longer pass blank=True.
    started_at = models.CharField(max_length=4000, blank=True, null=True, verbose_name='Дата начала курса')
    record_end_at = models.DateField(null=True, verbose_name='Дата окончания записи на курс')
    finished_at = models.DateField(null=True, verbose_name='Дата окончания курса')
    # Optional numeric characteristics of the course.
    rating = models.FloatField(blank=True, null=True, verbose_name='Рейтинг пользователей')
    experts_rating = models.FloatField(blank=True, null=True, verbose_name='Рейтинг экспертов')
    visitors_number = models.IntegerField(blank=True, null=True,
                                          verbose_name='Количество записавшихся на текущую сессию')
    total_visitors_number = models.IntegerField(blank=True, null=True,
                                                verbose_name='Количество записавшихся на все сессии онлайн курса')
    duration = models.IntegerField(blank=True, null=True, verbose_name='Длительность онлайн курса, недель')
    volume = models.IntegerField(blank=True, null=True, verbose_name='Объем онлайн курса, часов')
    intensity_per_week = models.IntegerField(blank=True, null=True,
                                             verbose_name='Требуемое время для изучения онлайн-курса, часов в неделю')
    # Course contents, declared as an unbounded text and as a 40000-character string.
    content = models.TextField(blank=True, null=True, verbose_name='Содержание онлайн курса')
    content = models.CharField(max_length=40000, blank=True, null=True, verbose_name='Содержание онлайн курса')
    lectures_number = models.IntegerField(blank=True, null=True, verbose_name='Количество лекций')
    # Link and certificate flag with typed fields (URL / boolean) ...
    external_url = models.URLField(blank=True, null=True, verbose_name='Ссылка на онлайн курс')
    has_certificate = models.BooleanField(blank=True, null=True, verbose_name='Возможность получить сертификат')
    # ... and as plain strings, plus the link to the course page on online.edu.ru.
    external_url = models.CharField(max_length=4000, blank=True, null=True, verbose_name='Ссылка на онлайн курс')
    roc_url = models.CharField(max_length=4000, blank=True, null=True, verbose_name='Ссылка на онлайн курс online.edu.ru')
    has_certificate = models.CharField(max_length=40, blank=True, null=True, verbose_name='Возможность получить сертификат')
    # Workload of the course in credit units ("з.е.").
    credits = models.FloatField(blank=True, null=True, verbose_name='Трудоемкость курса в з.е.')
    # Requirements, competences and learning outcomes as unbounded text ...
    requirements = models.TextField(verbose_name='Требования', blank=True, null=True)
    competences = models.TextField(verbose_name='Компетенции', blank=True, null=True)
    learning_outcome = models.TextField(verbose_name='Результаты', blank=True, null=True)
    # ... and as 40000-character strings.
    requirements = models.CharField(max_length=40000, verbose_name='Требования', blank=True, null=True)
    competences = models.CharField(max_length=40000, verbose_name='Компетенции', blank=True, null=True)
    learning_outcome = models.CharField(max_length=40000, verbose_name='Результаты', blank=True, null=True)
    # Many-to-many link to dataprocessing.models.Items.
    learning_outcome_list = models.ManyToManyField(Items)

    # Flag labelled "current course"; defaults to True.
    actual = models.BooleanField(blank=False, default=True, verbose_name='Актуальный курс')

    class Meta:
        # Human-readable singular / plural names of the model.
        verbose_name = 'Онлайн курс'
        verbose_name_plural = 'Онлайн курсы'

    def __str__(self):
        # A course is represented by its title.
        return self.title



191 changes: 191 additions & 0 deletions application/onlinecourse/onlinecourse_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
from bs4 import BeautifulSoup as bs
import requests
import re
import pandas as pd
import time
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from datetime import datetime


# Base URL of the online.edu.ru platform (no trailing slash).
# NOTE(review): not referenced by the functions visible in this module -- presumably passed in by callers.
online_edu_url = 'https://online.edu.ru'


def get_course_links(url):
    """
    Collect the link to every course page listed in the online.edu.ru catalogue.

    Listing pages ``<url>/public/courses.xhtml?page=N&name=asc`` are requested
    one after another, starting with N = 0, until a page contains no course
    widgets. The requested URL and the page number are printed as progress output.

    :param url: base URL of the platform; the listing path is appended to it as is
    :return: list of the ``href`` values of the course links, in listing order
    """
    widget_class = "ui-outputpanel ui-widget course-name"
    course_links = []
    # One session -- and therefore one connection pool and one retry policy --
    # serves the whole crawl and is closed when the crawl ends.
    with requests.Session() as session:
        adapter = HTTPAdapter(max_retries=Retry(connect=3, backoff_factor=0.5))
        session.mount('http://', adapter)
        session.mount('https://', adapter)
        page = 0
        while True:
            time.sleep(0.5)  # throttle requests to the catalogue
            main_url = url + '/public/courses.xhtml?page=' + str(page) + '&name=asc'
            print(main_url)
            html_text = session.get(main_url)
            soup = bs(html_text.text, "html.parser")
            print(page)
            # Searching from the document root also copes with a response that
            # has no <body>: it simply yields no widgets and ends the crawl.
            course_widgets = soup.find_all(class_=widget_class)
            if not course_widgets:
                break
            for widget in course_widgets:
                anchor = widget.find('a')
                href = anchor.get('href') if anchor is not None else None
                # Skip widgets without a usable link: callers concatenate the
                # returned values with the base URL, so None must not leak out.
                if href:
                    course_links.append(href)
            page += 1
    return course_links


def _parse_course_date(raw):
    """Turn a ``DD.MM.YYYY`` cell text into a ``date``; blank text gives ``None``."""
    raw = raw.strip()
    if not raw:
        return None
    return datetime.strptime(raw, '%d.%m.%Y').date()


def _list_items_text(container):
    """Join the text of every ``<li>`` inside *container* with a newline plus a space."""
    return '\n '.join(item.text for item in container.find_all('li'))


def get_course_info(course_url):
    """
    Scrape one course page into a single-row dataframe.

    :param course_url: absolute URL of the course page; also stored as ``roc_url``
    :return: ``pandas.DataFrame`` with one row and the 22 course columns, or
             ``None`` when the page carries an "archived-label" element or when
             parsing the page failed (the error is printed, not raised)
    """
    columns = ['id_from_roo', 'title', 'description', 'institution',
               'platform', 'language', 'started_at', 'record_end_at',
               'finished_at', 'rating', 'visitors_number', 'duration',
               'content', 'lectures_number', 'external_url', 'roc_url',
               'has_certificate', 'credits', 'requirements', 'learning_outcome',
               'competences', 'actual']

    time.sleep(5)  # throttle requests to the platform
    # The session is closed as soon as the (fully read) response is available.
    with requests.Session() as session:
        adapter = HTTPAdapter(max_retries=Retry(connect=3, backoff_factor=0.5))
        session.mount('http://', adapter)
        session.mount('https://', adapter)
        html_text = session.get(course_url)
    soup = bs(html_text.text, "html.parser")

    # Archived courses are not collected at all.
    if soup.find(class_="archived-label") is not None:
        return None

    # Defaults used for every value the page does not provide.
    title = ''
    id_from_roo = ''
    description = ''
    institution = ''
    platform = ''
    language = ''
    started_at = None
    record_end_at = None
    finished_at = None
    rating = 0
    visitors_number = 0
    duration = 0
    content = ''
    lectures_number = 0
    external_url = ''
    roc_url = course_url
    has_certificate = ''
    credits = 0
    requirements = ''
    competences = ''
    learning_outcome = ''
    actual = True

    try:
        # --- header block -------------------------------------------------
        course_info = soup.find(class_="course-info")
        # '.*' stops at the first line break, i.e. only the first line of the name is kept.
        title = re.search(r'.*', str(course_info.find(class_="course-name").text)).group(0)
        header_cells = [cell.text for cell in course_info.find_all(class_="ui-panelgrid-cell")]
        # Fixed positions in the header grid: 4th cell = start date, 6th = platform.
        started_at = header_cells[3]
        platform = re.sub(r' *', '', re.sub(r'\n', '', header_cells[5]))
        button_link = course_info.find(
            class_="ui-button ui-widget ui-state-default ui-corner-all ui-button-text-only rc-action-button")
        # NOTE(review): the match keeps the surrounding single quotes (and, being
        # greedy, everything up to the last quote of the onclick handler) --
        # confirm this is what consumers of external_url expect.
        external_url = re.search(r"'.*'", button_link.get('onclick')).group(0)

        # --- main block ---------------------------------------------------
        course_main = soup.find(class_="course-view-main content")
        institution = course_main.find(id="j_idt260:j_idt290").find('span').text
        rating_block = course_main.find(id="j_idt260:j_idt264")
        if rating_block is not None:
            # The rating is shown with a decimal comma.
            rating = float(re.sub(',', '.', re.search(r'\d.*', rating_block.text).group(0)))
        description = re.sub(r'\nО курсе', '', course_main.find(class_="ui-outputpanel ui-widget").text)
        requirements_block = course_main.find(id="j_idt123:0:j_idt124")
        if requirements_block is not None:
            requirements = requirements_block.text
        content_block = course_main.find(id="j_idt130")
        if content_block is not None:
            content = content_block.text
        competences_block = course_main.find(id="course-view-competences")
        if competences_block is not None:
            competences = _list_items_text(competences_block)
        results_block = course_main.find(id="course-view-results")
        if results_block is not None:
            learning_outcome = _list_items_text(results_block)

        # --- label / value grid ---------------------------------------------
        # Every label cell is immediately followed by its value cell. Walking
        # adjacent pairs means a label in the very last cell is simply ignored
        # instead of raising IndexError and discarding the whole course.
        cells = [cell.text for cell in course_main.find_all(class_="ui-panelgrid-cell")]
        for label, value in zip(cells, cells[1:]):
            if label == 'Количество лекций':
                if value != '':
                    lectures_number = int(value)
            elif label == 'ID курса':
                id_from_roo = value
            elif label == 'Дата ближайшего старта':
                started_at = value
            elif label == 'Дата окончания':
                finished_at = _parse_course_date(value)
            elif label == 'К-во обучающихся на версии курса':
                if value != '':
                    visitors_number = int(value)
            elif label == 'Язык':
                language = value
            elif label == 'Длительность':
                # Only the leading digits of the cell are the number of weeks.
                weeks = re.search(r'\d*', value).group(0)
                if weeks != '':
                    duration = int(weeks)
            elif label == 'Сертификат':
                has_certificate = value
            elif label == 'Дата окончания записи':
                # Stored as a date, like finished_at (both feed date columns).
                record_end_at = _parse_course_date(value)
            elif label == 'Трудоёмкость в з.е.':
                if value.strip():
                    # Accept a decimal comma, as is done for the rating.
                    credits = float(value.replace(',', '.'))

        return pd.DataFrame([[id_from_roo,
                              title,
                              description,
                              institution,
                              platform,
                              language,
                              started_at,
                              record_end_at,
                              finished_at,
                              rating,
                              visitors_number,
                              duration,
                              content,
                              lectures_number,
                              external_url,
                              roc_url,
                              has_certificate,
                              credits,
                              requirements,
                              learning_outcome,
                              competences,
                              actual]], columns=columns)
    except Exception as e:
        # Deliberately broad: a page with an unexpected layout must not abort
        # the whole crawl; the course is reported and skipped.
        print("Во время парсинга курса", title, 'произошла ошибка')
        print(e)
        return None


def get_all_data(url):
    """
    Collect the data of all online courses into one dataframe.

    First gathers the course links with ``get_course_links``, then scrapes
    every course with ``get_course_info``. Courses for which no dataframe is
    returned (``None``) are left out. Progress is printed along the way.

    :param url: base URL of the platform; course links are appended to it as is
    :return: ``pandas.DataFrame`` with one row per collected course and the 22
             course columns; an empty dataframe with those columns when nothing
             was collected
    """
    columns = ['id_from_roo', 'title', 'description', 'institution', 'platform', 'language',
               'started_at', 'record_end_at', 'finished_at', 'rating',
               'visitors_number', 'duration',
               'content', 'lectures_number', 'external_url', 'roc_url',
               'has_certificate', 'credits', 'requirements', 'learning_outcome',
               'competences', 'actual']
    print('Собираем ссылки на курсы')
    course_links = get_course_links(url)
    print('Количество онлайн курсов', len(course_links))
    print("Собираем сами курсы")
    # Per-course frames are gathered first and concatenated once at the end:
    # concatenating inside the loop copies all rows collected so far on every
    # iteration and starts from an empty frame.
    frames = []
    rows = 0
    for link in course_links:
        print(url + link)
        course_df = get_course_info(url + link)
        if course_df is not None:
            frames.append(course_df)
            rows += len(course_df)
        # Same "(rows, columns)" progress line as printing the accumulated shape.
        print((rows, len(columns)))
    if not frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, ignore_index=True)
19 changes: 1 addition & 18 deletions application/onlinecourse/serializers.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from rest_framework import serializers
from .models import Institution, Platform, OnlineCourse
from .models import OnlineCourse
from workprogramsapp.models import CourseCredit, CourseFieldOfStudy, FieldOfStudy, Topic, DisciplineSection, WorkProgram
from dataprocessing.models import Items

Expand All @@ -13,24 +13,9 @@ class Meta:
fields = "__all__"


class InstitutionSerializer(serializers.ModelSerializer):
    """Serializer for rights holders (the Institution model)."""
    class Meta:
        model = Institution
        # Expose every model field.
        fields = '__all__'


class PlatformSerializer(serializers.ModelSerializer):
    """Serializer for platforms (the Platform model)."""
    class Meta:
        model = Platform
        # Expose every model field.
        fields = '__all__'


class CourseCreditSerializer(serializers.ModelSerializer):
"""Сериализатор Перезачетов"""
course = serializers.SlugRelatedField(slug_field="title", read_only=True)
institution = InstitutionSerializer()
field_of_study = FieldOfStudySerializer(many=False)

class Meta:
Expand Down Expand Up @@ -85,8 +70,6 @@ class OnlineCourseSerializer(serializers.ModelSerializer):
"""Сериализатор Онлайн курса"""
course_field_of_study = CourseFieldOfStudySerializer(many=True)
course_credit = CourseCreditSerializer(many=True)
institution = InstitutionSerializer(many=False)
platform = PlatformSerializer(many=False)
learning_outcome_list = ItemsForOnlineCourseSerializer(many=True)
topic_with_online_course = OnlineCourseInTopics(many=True)

Expand Down
Loading