From 4b90d6462ac6a61232695afdb0684f1d2da80460 Mon Sep 17 00:00:00 2001
From: Struan Donald
Date: Fri, 17 Mar 2017 09:58:00 +0000
Subject: [PATCH] parse english and welsh only division counts

Fixes #63
---
 pyscraper/new_hansard.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/pyscraper/new_hansard.py b/pyscraper/new_hansard.py
index bbfa0571f..229405064 100755
--- a/pyscraper/new_hansard.py
+++ b/pyscraper/new_hansard.py
@@ -885,6 +885,24 @@ def parse_division(self, division):
                 continue
             self.parse_para(para)
 
+        # FIXME - we should actually store the numbers
+        # English-only / English-and-Welsh-only division counts: for now the
+        # text of each child tag is joined into a single <p> on the speech.
+        england_tags = division.xpath('./ns:EnglandWales/ns:hs_Para/* | ./ns:England/ns:hs_Para/*', namespaces=self.ns_map)
+        if len(england_tags):
+            self.mark_xpath_all_seen(division, './ns:EnglandWales | ./ns:England')
+            self.mark_xpath_all_seen(division, './ns:EnglandWales/ns:hs_Para | ./ns:England/ns:hs_Para')
+            details = etree.Element('p')
+            text = ''
+            for england_tag in england_tags:
+                self.mark_seen(england_tag)
+                # Read the text of the tag being iterated; .text may be None.
+                content = england_tag.text
+                if content:
+                    text += content
+            details.text = text
+            self.current_speech.append(details)
+
     def parse_time(self, tag):
         self.mark_seen(tag)
         time_txt = u''.join(tag.xpath('.//text()'))