Skip to content

Commit

Permalink
initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
laeubli committed Apr 21, 2017
0 parents commit 3306468
Show file tree
Hide file tree
Showing 11,575 changed files with 34,908 additions and 0 deletions.
The diff you're trying to view is too large. We only load the first 3000 changed files.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
.DS_Store
*.pyc
__pycache__
154 changes: 154 additions & 0 deletions classifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
#!/usr/bin/python3

import os
import glob
import random
from math import log10
from collections import defaultdict

class NBClassifier:
    """
    Implements a Naïve Bayes Classifier.

    Documents are text files referenced by path and tokenised on whitespace.
    Word likelihoods use add-1 smoothing over the training vocabulary; all
    probabilities are stored as base-10 logarithms.
    """

    def __init__(self, **documents):
        """
        Trains the classifier.
        @type documents: {str: list(str)}
        @param documents: class label -> [path_to_doc_1, path_to_doc_2, ...]
        """
        # a list (rather than a dict view) keeps the label order fixed
        self._classes = list(documents)
        self._vocabulary = set()
        self._word_freqs = {c: defaultdict(int) for c in self._classes}
        self._logprior = defaultdict(float)
        self._loglikelihood = {c: defaultdict(float) for c in self._classes}

        # read word frequencies from training data
        num_docs_of_any_class = sum(len(docs) for docs in documents.values())
        for current_class, docs in documents.items():
            class_freqs = self._word_freqs[current_class]
            for doc in docs:
                for word, frequency in self._get_word_freqs(doc).items():
                    class_freqs[word] += frequency
                    self._vocabulary.add(word)

        vocabulary_size = len(self._vocabulary)
        for current_class in self._classes:
            num_docs_of_current_class = len(documents[current_class])
            if num_docs_of_current_class:
                self._logprior[current_class] = log10(
                    num_docs_of_current_class / num_docs_of_any_class
                )
            else:
                # a class without training documents has prior probability 0;
                # log10(0) is undefined, so represent it as -inf
                self._logprior[current_class] = float("-inf")
            class_freqs = self._word_freqs[current_class]
            # add-1 smoothing: every vocabulary word counts once more
            num_words = sum(class_freqs.values()) + vocabulary_size
            class_loglikelihood = self._loglikelihood[current_class]
            for word in self._vocabulary:
                # .get() avoids inserting zero entries into the defaultdict
                class_loglikelihood[word] = log10(
                    (class_freqs.get(word, 0) + 1) / num_words
                )

    def classify(self, document):
        """
        Returns the most likely class label for @param document.

        Words that were not seen during training are ignored. If several
        classes are equally likely, the one that was passed first to the
        constructor wins.
        @raise ValueError: if the classifier was trained without any class.
        """
        # read and filter the document once instead of once per class
        known_words = [
            word for word in self._get_words(document)
            if word in self._vocabulary
        ]

        def _get_log_probability(current_class):
            loglikelihood = self._loglikelihood[current_class]
            log_probability = self._logprior[current_class]
            for word in known_words:
                log_probability += loglikelihood[word]
            return log_probability

        # max() returns the first maximal element, matching a stable
        # descending sort followed by [0]
        return max(self._classes, key=_get_log_probability)

    def evaluate(self, **documents):
        """
        Evaluates the classifier and prints accuracy as well as precision,
        recall and F1-score per class.
        @type documents: {str: list(str)}
        @param documents: class label -> [path_to_doc_1, path_to_doc_2, ...]
        Returns overall classification accuracy (a float between 0 and 1;
        0.0 if no documents were given).
        """
        num_items = 0
        num_correct = 0
        tp = defaultdict(int)  # true positive
        fp = defaultdict(int)  # false positive
        fn = defaultdict(int)  # false negative
        for true_label, docs in documents.items():
            for doc in docs:
                num_items += 1
                predicted_label = self.classify(doc)
                if true_label == predicted_label:
                    num_correct += 1
                    tp[true_label] += 1
                else:
                    fn[true_label] += 1
                    fp[predicted_label] += 1
        # overall classification accuracy
        accuracy = self._safe_ratio(num_correct, num_items)
        print("Classifier accuracy: {:0.2f}%.".format(accuracy*100))
        # precision, recall, f-measure per class; a metric whose denominator
        # is zero (e.g. a class that was never predicted) is reported as 0
        for c in self.classes():
            precision = self._safe_ratio(tp[c], tp[c] + fp[c])
            recall = self._safe_ratio(tp[c], tp[c] + fn[c])
            f1score = 2 * self._safe_ratio(precision * recall, precision + recall)
            print("Class {}:\n\t{:0.2f} precision\n\t{:0.2f} recall\n\t{:0.2f} F1-score".format(
                c, precision, recall, f1score
            ))
        return accuracy

    def vocabulary(self):
        """
        Returns the classifier's vocabulary.
        """
        return self._vocabulary

    def classes(self):
        """
        Returns the class labels this classifier can assign.
        """
        return self._classes

    @staticmethod
    def _safe_ratio(numerator, denominator):
        """
        Returns numerator / denominator, or 0.0 if the denominator is zero.
        """
        return numerator / denominator if denominator else 0.0

    @staticmethod
    def _get_words(path_to_email):
        """
        Reads an email stored at @param path_to_email. Returns the words it
        contains as a list (split on whitespace, in order of appearance).
        """
        # NOTE: the file is decoded with the platform's default encoding
        words = []
        with open(path_to_email, 'r') as f:
            for line in f:
                words.extend(line.split())
        return words

    @staticmethod
    def _get_word_freqs(path_to_email):
        """
        Reads an email stored at @param path_to_email. Returns the words it
        contains, alongside their frequency.
        """
        word_freqs = defaultdict(int)
        # reuse the tokenisation of _get_words so both stay in sync
        for word in NBClassifier._get_words(path_to_email):
            word_freqs[word] += 1
        return word_freqs


if __name__ == "__main__":
    # Trains a Naïve Bayes classifier on 9/10 of the ham and spam documents.
    # Uses the remainder for evaluation.

    def group_by_label(paths):
        """
        Returns {'ham': [...], 'spam': [...]} for the given document paths.
        Files whose name starts with 'spmsg' are labelled spam, all others ham.
        """
        grouped = {'ham': [], 'spam': []}
        for path in paths:
            label = 'spam' if os.path.basename(path).startswith('spmsg') else 'ham'
            grouped[label].append(path)
        return grouped

    # read data, shuffle, and split into training and evaluation set
    docs = glob.glob('data/bare/*/*.txt')
    num_docs = len(docs)
    num_eval = num_docs // 10
    num_train = num_docs - num_eval
    if num_eval == 0:
        # fewer than 10 files would leave the evaluation set empty
        raise SystemExit(
            "Found {0} emails matching data/bare/*/*.txt; at least 10 are "
            "needed for a 9/10 train/evaluation split.".format(num_docs)
        )
    random.shuffle(docs)
    docs_eval = group_by_label(docs[:num_eval])
    docs_train = group_by_label(docs[num_eval:])
    print("Found {0} emails. Using {1} for training, {2} for evaluation."
          .format(num_docs, num_train, num_eval))
    print("Training...")
    # train
    classifier = NBClassifier(**docs_train)
    # evaluate
    print("Evaluating...")
    classifier.evaluate(**docs_eval)
3 changes: 3 additions & 0 deletions data/bare/part1/3-1msg1.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Subject: re : 2 . 882 s - > np np

> date : sun , 15 dec 91 02 : 25 : 02 est > from : michael < mmorse @ vm1 . yorku . ca > > subject : re : 2 . 864 queries > > wlodek zadrozny asks if there is " anything interesting " to be said > about the construction " s > np np " . . . second , > and very much related : might we consider the construction to be a form > of what has been discussed on this list of late as reduplication ? the > logical sense of " john mcnamara the name " is tautologous and thus , at > that level , indistinguishable from " well , well now , what have we here ? " . to say that ' john mcnamara the name ' is tautologous is to give support to those who say that a logic-based semantics is irrelevant to natural language . in what sense is it tautologous ? it supplies the value of an attribute followed by the attribute of which it is the value . if in fact the value of the name-attribute for the relevant entity were ' chaim shmendrik ' , ' john mcnamara the name ' would be false . no tautology , this . ( and no reduplication , either . )
3 changes: 3 additions & 0 deletions data/bare/part1/3-1msg2.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Subject: s - > np + np

the discussion of s - > np + np reminds me that some years ago i read , in a source now forgotten , a critique of some newsmagazines ' unique tendencies in writing style , most of which the writer found overly " cute " . one item was tersely put down as follows : " time 's favorite : the colon . " - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - lee hartman ga5123 @ siucvmb . bitnet department of foreign languages southern illinois university carbondale , il 62901 u . s . a .
3 changes: 3 additions & 0 deletions data/bare/part1/3-1msg3.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Subject: 2 . 882 s - > np np

. . . for me it 's much more restrictive than s - > np np . it 's " no " np pro quite an over-restriction , that .
3 changes: 3 additions & 0 deletions data/bare/part1/3-375msg1.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Subject: gent conference

" for the listserv " international conference 1992 second circular : february 1992 literature and the analysis of discourse with special attention to the multicultural context tuesday 8 september - friday 11 september 1992 gent university , belgium writing and reading literature , oral literary traditions , dialogic text , non-literary narratives , discourse theory , literature as social practice , etc . , etc . , etc . keynote speakers : david birch ( murdoch , australia ) martin montgomery ( strathclyde , scotland ) elinor ochs ( los angeles , usa ) statement of pala ' s aims pala 's principal aim is to encourage cooperation between scholars and teachers interested in language and / or literary studies . the interests of pala members are wide , and this is reflected in papers given at pala conferences . interests of members include : stylistics , literary theory , the teaching of language and literature , critical linguistics , pragmatics , discours analysis , textual understanding , rhetoric , narratology , semiotic approaches to text and performance , sociolinguistics , cultural studies , post-structuralist theory ; in short , any theme which has relevance to the study and teaching of language and literature and their role in society . the 1992 conference theme to highlight the currently expanding field of discours studies , the 1992 conference has as its core theme ' literature and the analysis of discourse , with special attention to the multicultural context ' . papers covering interests as wide as the processes of writing and reading literature , the analysis of dialogic text , oral literary traditions , the relationship between literary and non-literary discourse , discourse theory and literary communication as social practice have all been proposed , as well as those dealing specifically with the writing and reading of literature in a multilingual and / or multicultural context . 
the 1992 conference venue gent university is of the city type ; there is no campus , and university buildings are dotted around the town . conference sessions will take place in the hoveniersberg , overlooking the bovenschelde in one of the quiet parts of town . programme conference sessions will start on the morning of the wednesday and last a full three days . it is envisaged that most participants will arrive and register on the tuesday evening . our provisional programme looks like this : tuedsday 8 sept 15 . 00 onwards : registration wednesday 9 sept 08 . 30 - 09 . 30 : late registration 09 . 45 : opening of conference 10 . 00 - 18 . 00 : conference sessions 18 . 30 : pre-booked dinner 20 . 15 : drinks reception thursday 10 sept 08 . 30 - 18 . 00 : conference sessions 18 . 30 : pala agm 20 . 00 : pre-booked dinner friday 11 sept 08 . 30 - 17 . 00 : conference sessions 17 . 15 : wind-up session evening : activities to be arranged there will be continuous coffee , tea , etc . throughout the conference sessions . accommodation rooms in the vermeylen student hall of residence , a couple of hundred metres from the conference centre , are available to all participants . it is possible to book rooms for several nights either side of the conference dates . the price on the registration form includes breakfast . unfortunately , no double rooms are available . if you would prefer to stay in a hotel , we recommend the arcade hotel ( nederkouter , 9000 gent ; tel . 32-91 - 25 . 07 . 07 ) , which is only 10 minutes ' walk from the conference centre . alternatively , you can contact the gent tourist office ( meersstraat 138 , 9000 gent ; tel . 32-91 - 25 . 35 . 55 ) . food breakfast will be served in the overpoort , the university eating complex next door to the vermeylen . lunch and supper is also available there to conference participants , as are snacks throughout the day . 
there will be no single ' conference dinner ' as such , but to make it easier for participants to meet each other , we are arranging dinners for both wednesday and thursday evenings in the university restaurant . these have to be pre-booked . staying in gent gent ( population around 230 , 000 ) is a historic flemish city , the first in europe to declare itself independent of feudal control . it has a plethora of medieval vistas and bridges and is thus entitled to compete with bruges and amsterdam for the title of ' venice of the north ' . it is also a busy industrial city and the commercial and administrative centre for east flanders . the first language is flemish / dutch ( depending on one 's sociolinguistic viewpoint ) but nearly every-body can use both english and french with at least some degree of fluency . there are numerous restaurants , cafes and pubs near the conference area ( including two good vegetarian restaurants ) , many of which stay open well into the small hours . prices are cheap by northern european standards . for those wishing to combine the conference with a visit to gent and the surrounding area , you may like to know that a train can take you in less than an hour to bruges , brussels , antwerp or the belgian coast . you can even get into the ardennes or to paris within a few hours . registration / queries to attend the conference , fill in the registration form and return it , with payment , by 1st may . confirmation of registration and details of arrangements will be sent in the third circular to those who have registered , but if you have any enquiries , contact jim o'driscoll or stef slembrouck at seminarie voor engelse taalkunde , universiteit gent , rozier 44 , b-9000 gent , belgium ( tel : 32-91 - 64 . 37 . 88 / 89 / 90 ; fax : 32-91 - 64 . 41 . 95 ; e-mail pala92 @ engllang . rug . ac . be ) . 
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * pala 92 gent university registration form surname _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ first name ( s ) _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ address _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ affiliation _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ i will participate in the conference and enclose a eurocheque ( or have arranged direct transfer to the pala account in belgium ) to cover : ( tick as appropriate ) pala member conference fee ( bf 1000 ) _ _ _ _ _ _ non-member conference fee ( bf 2000 ) _ _ _ _ _ _ student conference fee ( bf 600 ) _ _ _ _ _ _ dinner on 9th september ( bf 500 ) _ _ _ _ _ _ dinner on 10th september ( bf 500 ) _ _ _ _ _ _ accommodation for tue 8th september ( bf 525 ) _ _ _ _ _ _ accommodation for wed 9th september ( bf 525 ) _ _ _ _ _ _ accommodation for thu 10th september ( bf 525 ) _ _ _ _ _ _ accommodation for fri 11th september ( bf 525 ) _ _ _ _ _ _ accommodation for ( specify ) ( bf ) _ _ _ _ _ _ fee for international money transfer or cheque other than eurocheques * ( bf 300 ) _ _ _ _ _ _ i therefore enclose ( or have transferred ) a total of bf _ _ _ _ _ _ i would like lacto-vegetarian / vegan food for the dinner ( s ) i have booked _ _ _ _ _ signature _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ please return to pala conference 1992 , seminarie voor engelse taalkunde , universiteit gent , rozier 44 , b-9000 gent , belgium ( pala9 @ engllang . rug . ac . be ) . the final date for registration is 1st may 1992 . _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ * . note that all payments must be made in belgian francs . cheques should be made payable to ' pala conference 1992 ' . 
a single eurocheque must not be of more than bf 7 , 000 . international money transfers should be sent via ' swift ' , quoting our bank 's swift number ( bbru be bb 900 ) and our account number : bbl 390-0959358 - 83 . if you have any problems with either method of payment , please contact the organizers .
3 changes: 3 additions & 0 deletions data/bare/part1/3-378msg1.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Subject: query : causatives in korean

could anyone point me to any books and articles about causative constructions in korean ? please send an e-mail directly to me . thanks you ! hiromi morikawa hiromi @ psych . stanford . edu
3 changes: 3 additions & 0 deletions data/bare/part1/3-378msg2.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Subject: l2 learning / cultural empathy

a graduate student in education approached a colleague of mine with a query which linguist people may be able to help with . he is doing an evaluation of an exchange program with indonesia one object of which is to prepare more high school teachers of indonesia here in australia . he wondered if there is anything written on the correlation of degree of acquisition of an l2 with degree of empathy and participation in the culture of the l2 speakers . high involvement / skills might seem to be a good thing for teachers in both areas ; however negative aspects might be there . apparently there seems to be a correlation between development of good skills in the language and dropping out of teaching . could this be due to " culture shock " on return to australia of those who became most deeply immersed ? any references / ideas gratefully received . if there are a number , i could summarise . patrick mcconvell , anthropology , northrn territory university , po box 40146 , casuarina , nt 0811 , australia
3 changes: 3 additions & 0 deletions data/bare/part1/3-378msg3.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Subject: psycholinguistics teaching

for an undergraduate course i will shortly be teaching in psycholinguistics , i would appreciate any suggestions as to texts which other instructors have had good experiences with . also , i would be indebted if anyone can offer specific references to the work of helen neville on deaf alinguals and the acquisiton of asl . thanks m . klaiman ( klaiman @ umnacux . bitnet )
3 changes: 3 additions & 0 deletions data/bare/part1/3-378msg4.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Subject: german corpora

i am looking for on-line corpora of modern german . any information would be appreciated . ken beesley beesley . parc @ xerox . com
3 changes: 3 additions & 0 deletions data/bare/part1/3-378msg5.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Subject: t

hi , help ! i have to design an experiment to do with mandarin tones as part of a phonology requirement on my graduate course . there seems to be very little literature on this in the library . if anyone can think of any on-going debates on the phonology / phonetics of mandarin tones for which an experiment would be useful , please could you give me information and references . i would welcome any suggestions at all . thanks a lot , sophia wang . ( sophia @ ling . ed . ac . uk )
3 changes: 3 additions & 0 deletions data/bare/part1/3-379msg1.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Subject: job - university of utah

the linguistics program at the university of utah invites applications for a one-year visiting assistant professor position to begin september , 1992 . minimum degree requirement is an a . b . d . candidates will be expected to teach an introductory undergraduate linguistics course and a course in american english for english teaching majors . they will also propose other undergraduate or m . a . level courses in general linguistics and sociolinguistics . send letter of application , curriculum vitae , sample publications , and three letters of reference to mauricio mixco , director , linguistics program , stewart building 213 , university of utah , salt lake city , ut 84112 . for further information you may telephone : 801-581 - 7432 or email dipaolo @ anthro . utah . edu .
Loading

0 comments on commit 3306468

Please sign in to comment.