-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patharticle.py
98 lines (80 loc) · 3.03 KB
/
article.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
from pprint import pformat
import scrapy
from peewee import CharField, DateField, TextField, IntegerField, fn
from playhouse.shortcuts import model_to_dict
from playhouse.sqlite_ext import JSONField, SearchField, FTSModel
from utils import BaseModel, db
class Article(BaseModel):
    """A scraped article stored via the peewee ORM.

    Rows are created as URL-only stubs first (see ``by_url``); the remaining
    fields are filled in once the page has been parsed, so ``title IS NULL``
    marks a row as "queued but not yet parsed".
    """

    url = CharField(unique=True)    # canonical article URL (unique key)
    title = CharField()
    description = CharField()
    date = DateField()
    text = TextField()              # full article body
    rubrics = JSONField()
    themes = JSONField()
    difficulty = IntegerField()
    internal_links = JSONField()    # links pointing within the site
    external_links = JSONField()    # links pointing off-site
    author = CharField()

    @classmethod
    def by_url(cls, url):
        """Return the Article row for *url*, creating a stub row if absent."""
        article, _created = cls.get_or_create(url=url, defaults={'url': url})
        return article

    @staticmethod
    def is_parsed(url):
        """Return True if *url* exists in the DB and has been parsed."""
        try:
            art = Article.get(Article.url == url)
        except Article.DoesNotExist:
            return False
        return art.title is not None

    @staticmethod
    def iter_unparsed_urls():
        """Yield URLs of rows that were queued but not yet parsed."""
        # Fix: the original chained a redundant second .select() onto an
        # already-built query; a single select/where is sufficient.
        # ``== None`` / ``!= None`` are required by peewee (SQL NULL tests).
        query = Article.select().where(Article.url != None,
                                       Article.title == None)
        for art in query:
            yield art.url

    @staticmethod
    def parsed_num():
        """Return the number of rows already parsed (title filled in)."""
        return Article.select().where(Article.title != None).count()

    @staticmethod
    def unparsed_num():
        """Return the number of queued rows still awaiting parsing."""
        return Article.select().where(Article.title == None).count()

    class ScrapyItem(scrapy.Item):
        """Thin scrapy.Item wrapper carrying a model dict in ``data``."""
        data = scrapy.Field()

        def __repr__(self):
            # Show only the fields useful for log inspection, not the
            # full article text.
            req_fields = ['url', 'title', 'description', 'author',
                          'internal_links']
            return pformat({f: self['data'][f] for f in req_fields})

    def to_scrapy_item(self):
        """Wrap this row as a scrapy item (dict snapshot of all fields)."""
        return self.ScrapyItem(data=model_to_dict(self))

    def save(self, force_insert=False, only=None):
        """Persist the row and keep the full-text index in sync.

        Fix: propagate the row count returned by ``Model.save`` (the
        original discarded it and implicitly returned None).
        """
        rows = super(Article, self).save(force_insert, only)
        ArticleIndex.index_article(self)
        return rows
class ArticleIndex(FTSModel):
    """SQLite full-text-search index over article title and body text.

    Rows are keyed by ``docid`` equal to the corresponding ``Article.id``.
    """

    title = SearchField()
    content = SearchField()

    class Meta:
        database = db
        # Use our custom tokenizer
        extension_options = {'tokenize': 'snowball_russian'}

    @staticmethod
    def index_article(article):
        """Insert or refresh the FTS row for *article* (keyed by docid)."""
        q = ArticleIndex.docid == article.id
        if not ArticleIndex.select().where(q).count():
            ArticleIndex.insert({
                ArticleIndex.docid: article.id,
                ArticleIndex.title: article.title,
                ArticleIndex.content: article.text}).execute()
        else:
            # BUG FIX: the update query was built but never run — peewee
            # queries are lazy until .execute() is called, so re-indexing
            # an existing article silently did nothing.
            (ArticleIndex
             .update(title=article.title, content=article.text)
             .where(q)
             .execute())

    @staticmethod
    def search_by_text(phrase):
        """Full-text search for *phrase* over indexed articles.

        Returns a query of Article rows joined against the FTS index,
        ordered by BM25 relevance, with a ``snippets`` field containing
        the matched fragments wrapped in ``*`` markers.
        """
        return (Article
                .select(Article,
                        fn.snippet(ArticleIndex.as_entity(),
                                   '*', '*', '...').alias('snippets'))
                .join(ArticleIndex,
                      on=(Article.id == ArticleIndex.docid))
                .where(ArticleIndex.match(phrase))
                .order_by(ArticleIndex.bm25()))