This repository has been archived by the owner on Feb 1, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
/
lemma_matcing.py
69 lines (56 loc) · 2.24 KB
/
lemma_matcing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
from __future__ import division
from xml.etree.ElementTree import ElementTree
from xml.etree.cElementTree import parse as xmlparse
#preprocessed_data = parse_preprocessed_xml('rte2_dev_data/RTE2_dev.preprocessed.xml')
#data = parse_xml('rte2_dev_data/RTE2_dev.xml')
def clean(word):
    """Return *word* with leading/trailing commas, periods and spaces removed.

    Only the ends of the string are trimmed; the same characters inside
    the word are left untouched.
    """
    trimmed = word.strip(",. ")
    return trimmed
def lemma_matching(text, hypothesis):
    """Return the fraction of hypothesis lemmas that also occur in the text.

    Args:
        text: iterable of sentence objects, each exposing ``nodes``; every
            node exposes ``isWord`` and, when ``isWord`` is true, ``lemma``.
        hypothesis: iterable of sentence objects with the same shape.

    Returns:
        A float between 0.0 and 1.0: the number of hypothesis word lemmas
        (duplicates counted individually) found among the text's word
        lemmas, divided by the total number of hypothesis word lemmas.
        Returns 0.0 when the hypothesis contains no word nodes, instead of
        raising ZeroDivisionError.
    """
    # A set gives constant-time membership tests; only presence matters
    # on the text side, so dropping duplicates does not change the result.
    text_lemmas = set(n.lemma for s in text for n in s.nodes if n.isWord)
    hyp_lemmas = [n.lemma for s in hypothesis for n in s.nodes if n.isWord]
    if not hyp_lemmas:
        # Nothing to match: report "no overlap" rather than dividing by zero.
        return 0.0
    # Count explicitly instead of len(filter(...)): on Python 3 filter()
    # returns an iterator, which has no len().
    matched = sum(1 for lemma in hyp_lemmas if lemma in text_lemmas)
    return float(matched) / len(hyp_lemmas)
class Pair(object):
    """One text/hypothesis pair built from a ``pair`` XML element.

    Attributes set by the constructor:
        id: the element's ``id`` attribute, whitespace-stripped.
        task: the element's ``task`` attribute, whitespace-stripped.
        tast: same value as ``task``; the original (misspelled) attribute
            name, kept so existing callers keep working.
        text: list of ``Sentence`` objects, one per ``text/sentence`` child.
        hypothesis: list of ``Sentence`` objects, one per
            ``hypothesis/sentence`` child.
        entailment: the element's ``entailment`` attribute, unmodified.

    Raises:
        KeyError: if ``id``, ``task`` or ``entailment`` is missing.
    """

    def __init__(self, etree):
        attrib = etree.attrib
        self.id = attrib['id'].strip()
        self.task = attrib['task'].strip()
        # Backward-compatible alias for the historical typo.
        self.tast = self.task
        self.text = [Sentence(s) for s in etree.iterfind('text/sentence')]
        self.hypothesis = [Sentence(s) for s in etree.iterfind('hypothesis/sentence')]
        self.entailment = attrib['entailment']
class Sentence(object):
    """A sentence: a serial identifier plus the list of its nodes.

    ``serial`` is the element's ``serial`` attribute with surrounding
    whitespace removed; ``nodes`` holds one ``Node`` per ``node`` child
    element, in the order ``iterfind`` yields them.
    """

    def __init__(self, etree):
        raw_serial = etree.attrib['serial']
        self.serial = raw_serial.strip()
        collected = []
        for child in etree.iterfind('node'):
            collected.append(Node(child))
        self.nodes = collected
class Node(object):
    """A single node of a parsed sentence, built from a ``node`` element.

    Attributes set by the constructor:
        id: the element's ``id`` attribute, unmodified.
        isWord: False when ``id`` starts with ``'E'`` (artificial node),
            True otherwise.
        word, lemma, postag: stripped text of the ``word``, ``lemma`` and
            ``pos-tag`` children for word nodes; None for artificial nodes.
        relation: stripped text of the ``relation`` child for word nodes;
            None when the child is absent or the node is artificial.
        relations: same value as ``relation``; kept because the original
            code stored the stripped text under this name.

    Raises:
        KeyError: if the ``id`` attribute is missing.
        AttributeError: if a word node lacks a ``word``, ``lemma`` or
            ``pos-tag`` child (``findtext`` returns None for those).
    """

    def __init__(self, etree):
        self.id = etree.attrib['id']
        # startswith() also copes with an empty id, where id[0] would raise.
        self.isWord = not self.id.startswith('E')

        # Define every attribute up front so that artificial nodes and nodes
        # without a relation never raise AttributeError on attribute access.
        self.word = None
        self.lemma = None
        self.postag = None
        self.relation = None
        self.relations = None
        if not self.isWord:
            return

        self.word = etree.findtext('word').strip()
        self.lemma = etree.findtext('lemma').strip()
        self.postag = etree.findtext('pos-tag').strip()

        relation = etree.findtext('relation')
        if relation:
            relation = relation.strip()
        # Both names carry the same normalized value; previously only
        # ``relations`` was stripped and it existed only when a relation
        # was present.
        self.relation = relation
        self.relations = relation
def parse_preprocessed_xml(fileh):
    """Parse a preprocessed XML source into a list of ``Pair`` objects.

    ``fileh`` is passed unchanged to the ElementTree ``parse`` function
    (imported in this module as ``xmlparse``). One ``Pair`` is built for
    every element yielded by ``iterfind('pair')`` on the parsed tree, in
    document order.
    """
    document = xmlparse(fileh)
    return [Pair(element) for element in document.iterfind('pair')]
def traverse_preprocessed(pairs, function, threshold):
    """Print a YES/NO entailment decision for every pair.

    Writes the header line ``ranked: no`` to standard output, then one
    line per pair of the form ``<id> YES`` or ``<id> NO``.

    Args:
        pairs: iterable of objects exposing ``id``, ``text`` and
            ``hypothesis``.
        function: callable invoked as ``function(pair.text,
            pair.hypothesis)``; its result is compared with ``threshold``.
        threshold: a pair is answered ``YES`` only when the function's
            result is strictly greater than this value.

    Returns:
        None.
    """
    # Single-argument print(...) calls produce identical output under both
    # the Python 2 print statement and the Python 3 print function, so this
    # function no longer depends on Python-2-only syntax.
    print("ranked: no")
    for pair in pairs:
        verdict = 'YES' if function(pair.text, pair.hypothesis) > threshold else 'NO'
        print("%s %s" % (pair.id, verdict))
if __name__ == '__main__':
    # Script entry point.
    #   argv[1]: source handed to parse_preprocessed_xml
    #   argv[2]: decision threshold, converted with float()
    import sys
    xml_source = sys.argv[1]
    data = parse_preprocessed_xml(xml_source)
    threshold = float(sys.argv[2])
    traverse_preprocessed(data, lemma_matching, threshold)