Skip to content

Commit

Permalink
modify the parsing result structure of ThiVien site
Browse files Browse the repository at this point in the history
  • Loading branch information
u8621011 committed Aug 7, 2018
1 parent b676f1a commit 6ed4e25
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 17 deletions.
25 changes: 10 additions & 15 deletions pyVitk/crawler/ThiVienParser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,11 @@

""""Parser for http://hvdic.thivien.net
Tries to parse out the Han-Viet (Sino-Vietnamese) relationship for a given Chinese word.
Return samples:
[
{
"ChineseWord": "文",
"HanViet": "văn",
}, {
"ChineseWord": "文",
"HanViet": "vấn",
}
]
"""

import requests
from bs4 import BeautifulSoup
from pyVitk.DictionaryLexicon import DictionaryLexicon


def parse_hanviet(w):
Expand All @@ -31,13 +21,18 @@ def parse_hanviet(w):
data = r.text
soup = BeautifulSoup(data, 'lxml')

lex = DictionaryLexicon()
lex.source_language = 'zh-TW'
lex.target_language = 'vi-VN'
lex.source_title = w
for info_div in soup.find_all('div', class_='info'):
hanviet_spans = info_div.find_all('span')
for s in hanviet_spans:
result_bank.append({
'ChineseWord': w,
'HanViet': s.string,
lex.pron_systems.append({
'name': 'HanViet',
'pronunciation': s.string,
})

return result_bank
result_bank.append(lex)

return result_bank
10 changes: 8 additions & 2 deletions pyVitk/test/ThiVienParserTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,11 @@

class ThiVienParserTest(TestCase):
def test_parse_from_tchinese(self):
hanviets = parse_hanviet('文')
self.assertEqual(len(hanviets), 2)
lexs = parse_hanviet('文')

self.assertEqual(len(lexs), 1)
self.assertEqual(lexs[0].source_language, 'zh-TW')
self.assertEqual(lexs[0].target_language, 'vi-VN')
self.assertEqual(lexs[0].source_title, '文')
self.assertEqual(len(lexs[0].pron_systems), 2)

0 comments on commit 6ed4e25

Please sign in to comment.