-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 3306468
Showing
11,575 changed files
with
34,908 additions
and
0 deletions.
The diff you're trying to view is too large. We only load the first 3000 changed files.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
.DS_Store | ||
*.pyc | ||
__pycache__ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,154 @@ | ||
#!/usr/bin/python3 | ||
|
||
import os | ||
import glob | ||
import random | ||
from math import log10 | ||
from collections import defaultdict | ||
|
||
class NBClassifier:
    """
    Implements a Naïve Bayes Classifier.

    Documents are files on disk; their whitespace-separated tokens are the
    features. Word likelihoods use add-1 smoothing and all probabilities are
    stored as base-10 logarithms.
    """

    def __init__(self, **documents):
        """
        Trains the classifier.
        @type documents: {str: list(str)}
        @param documents: class label -> [path_to_doc_1, path_to_doc_2, ...]
        @raise ValueError: if class labels are given but none of them has any
            training document.
        """
        self._classes = documents.keys()
        self._vocabulary = set()
        self._word_freqs = {c: defaultdict(int) for c in self._classes}
        self._logprior = defaultdict(float)
        self._loglikelihood = {c: defaultdict(float) for c in self._classes}

        num_docs_of_any_class = sum(len(docs) for docs in documents.values())
        if documents and num_docs_of_any_class == 0:
            # Without this guard the prior computation below would fail with
            # an uninformative ZeroDivisionError.
            raise ValueError("Cannot train NBClassifier without any documents.")

        # read word frequencies from training data
        for current_class, docs in documents.items():
            class_freqs = self._word_freqs[current_class]
            for doc in docs:
                for word, frequency in self._get_word_freqs(doc).items():
                    class_freqs[word] += frequency
            self._vocabulary.update(class_freqs)

        vocabulary_size = len(self._vocabulary)
        for current_class, docs in documents.items():
            num_docs_of_current_class = len(docs)
            if num_docs_of_current_class:
                self._logprior[current_class] = log10(
                    num_docs_of_current_class / num_docs_of_any_class)
            else:
                # log10(0) is a math domain error; a class without training
                # documents has prior probability 0, i.e. log prior -inf.
                self._logprior[current_class] = float("-inf")
            class_freqs = self._word_freqs[current_class]
            # add-1 smoothing: every vocabulary word counts once extra
            num_words = sum(class_freqs.values()) + vocabulary_size
            loglikelihood = self._loglikelihood[current_class]
            for word in self._vocabulary:
                loglikelihood[word] = log10(
                    (class_freqs.get(word, 0) + 1) / num_words)

    def classify(self, document):
        """
        Returns the most likely class label for @param document.
        @type document: str
        @param document: path to the document to classify
        Words that were not seen during training are ignored. If several
        classes score equally, the first one in training order wins.
        """
        # Read and filter the document once instead of once per class.
        known_words = [word for word in self._get_words(document)
                       if word in self._vocabulary]

        # helper: calculate log probability by class
        def _get_log_probability(current_class):
            loglikelihood = self._loglikelihood[current_class]
            log_probability = self._logprior[current_class]
            for word in known_words:
                log_probability += loglikelihood[word]
            return log_probability

        return max(self.classes(), key=_get_log_probability)

    def evaluate(self, **documents):
        """
        Evaluates the classifier.
        @type documents: {str: list(str)}
        @param documents: class label -> [path_to_doc_1, path_to_doc_2, ...]
        Prints overall accuracy as well as precision, recall and F1-score per
        class. A metric whose denominator is zero is reported as 0.
        Returns overall classification accuracy (a float between 0 and 1).
        """
        # helper: division that yields 0.0 instead of raising on an empty
        # denominator (e.g. a class that was never predicted)
        def _ratio(numerator, denominator):
            return numerator / denominator if denominator else 0.0

        num_items = 0
        num_correct = 0
        tp = defaultdict(int)  # true positive
        fp = defaultdict(int)  # false positive
        fn = defaultdict(int)  # false negative
        for true_label, docs in documents.items():
            for doc in docs:
                num_items += 1
                predicted_label = self.classify(doc)
                if true_label == predicted_label:
                    num_correct += 1
                    tp[true_label] += 1
                else:
                    fn[true_label] += 1
                    fp[predicted_label] += 1
        # overall classification accuracy
        accuracy = _ratio(num_correct, num_items)
        print("Classifier accuracy: {:0.2f}%.".format(accuracy*100))
        # precision, recall, f-measure per class
        for c in self.classes():
            precision = _ratio(tp[c], tp[c] + fp[c])
            recall = _ratio(tp[c], tp[c] + fn[c])
            f1score = 2 * _ratio(precision * recall, precision + recall)
            print("Class {}:\n\t{:0.2f} precision\n\t{:0.2f} recall\n\t{:0.2f} F1-score".format(
                c, precision, recall, f1score
            ))
        return accuracy

    def vocabulary(self):
        """
        Returns the classifier's vocabulary.
        """
        return self._vocabulary

    def classes(self):
        """
        Returns the class labels this classifier can assign.
        """
        return self._classes

    @staticmethod
    def _get_words(path_to_email):
        """
        Reads an email stored at @param path_to_email. Returns the words it
        contains as a list. Words are whatever str.split() yields, i.e. the
        text is split on whitespace only.
        """
        # The file is opened with the platform's default text encoding.
        with open(path_to_email, 'r') as f:
            return [word for line in f for word in line.split()]

    @staticmethod
    def _get_word_freqs(path_to_email):
        """
        Reads an email stored at @param path_to_email. Returns the words it
        contains, alongside their frequency (a defaultdict word -> count).
        """
        word_freqs = defaultdict(int)
        # Reuse _get_words so that both helpers tokenise identically.
        for word in NBClassifier._get_words(path_to_email):
            word_freqs[word] += 1
        return word_freqs
|
||
|
||
if __name__ == "__main__":
    # Trains a Naïve Bayes classifier on 9/10 of the ham and spam documents.
    # Uses the remainder for evaluation.

    def group_by_label(paths):
        """
        Groups email paths by class label. Files whose base name starts with
        'spmsg' are labelled 'spam', all others 'ham'.
        """
        grouped = {'ham': [], 'spam': []}
        for path in paths:
            class_label = 'spam' if os.path.basename(path).startswith('spmsg') else 'ham'
            grouped[class_label].append(path)
        return grouped

    # read data, shuffle, and split into training and evaluation set
    docs = glob.glob('data/bare/*/*.txt')
    if not docs:
        # Training on nothing would only fail later with an opaque error.
        raise SystemExit("No emails found matching data/bare/*/*.txt.")
    num_docs = len(docs)
    num_eval = num_docs // 10
    num_train = num_docs - num_eval
    random.shuffle(docs)
    docs_eval = group_by_label(docs[:num_eval])
    docs_train = group_by_label(docs[num_eval:])
    print("Found {0} emails. Using {1} for training, {2} for evaluation."
          .format(num_docs, num_train, num_eval))
    print("Training...")
    # train
    classifier = NBClassifier(**docs_train)
    # evaluate
    print("Evaluating...")
    classifier.evaluate(**docs_eval)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
Subject: re : 2 . 882 s - > np np | ||
|
||
> date : sun , 15 dec 91 02 : 25 : 02 est > from : michael < mmorse @ vm1 . yorku . ca > > subject : re : 2 . 864 queries > > wlodek zadrozny asks if there is " anything interesting " to be said > about the construction " s > np np " . . . second , > and very much related : might we consider the construction to be a form > of what has been discussed on this list of late as reduplication ? the > logical sense of " john mcnamara the name " is tautologous and thus , at > that level , indistinguishable from " well , well now , what have we here ? " . to say that ' john mcnamara the name ' is tautologous is to give support to those who say that a logic-based semantics is irrelevant to natural language . in what sense is it tautologous ? it supplies the value of an attribute followed by the attribute of which it is the value . if in fact the value of the name-attribute for the relevant entity were ' chaim shmendrik ' , ' john mcnamara the name ' would be false . no tautology , this . ( and no reduplication , either . ) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
Subject: s - > np + np | ||
|
||
the discussion of s - > np + np reminds me that some years ago i read , in a source now forgotten , a critique of some newsmagazines ' unique tendencies in writing style , most of which the writer found overly " cute " . one item was tersely put down as follows : " time 's favorite : the colon . " - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - lee hartman ga5123 @ siucvmb . bitnet department of foreign languages southern illinois university carbondale , il 62901 u . s . a . |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
Subject: 2 . 882 s - > np np | ||
|
||
. . . for me it 's much more restrictive than s - > np np . it 's " no " np pro quite an over-restriction , that . |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
Subject: gent conference | ||
|
||
" for the listserv " international conference 1992 second circular : february 1992 literature and the analysis of discourse with special attention to the multicultural context tuesday 8 september - friday 11 september 1992 gent university , belgium writing and reading literature , oral literary traditions , dialogic text , non-literary narratives , discourse theory , literature as social practice , etc . , etc . , etc . keynote speakers : david birch ( murdoch , australia ) martin montgomery ( strathclyde , scotland ) elinor ochs ( los angeles , usa ) statement of pala ' s aims pala 's principal aim is to encourage cooperation between scholars and teachers interested in language and / or literary studies . the interests of pala members are wide , and this is reflected in papers given at pala conferences . interests of members include : stylistics , literary theory , the teaching of language and literature , critical linguistics , pragmatics , discours analysis , textual understanding , rhetoric , narratology , semiotic approaches to text and performance , sociolinguistics , cultural studies , post-structuralist theory ; in short , any theme which has relevance to the study and teaching of language and literature and their role in society . the 1992 conference theme to highlight the currently expanding field of discours studies , the 1992 conference has as its core theme ' literature and the analysis of discourse , with special attention to the multicultural context ' . papers covering interests as wide as the processes of writing and reading literature , the analysis of dialogic text , oral literary traditions , the relationship between literary and non-literary discourse , discourse theory and literary communication as social practice have all been proposed , as well as those dealing specifically with the writing and reading of literature in a multilingual and / or multicultural context . 
the 1992 conference venue gent university is of the city type ; there is no campus , and university buildings are dotted around the town . conference sessions will take place in the hoveniersberg , overlooking the bovenschelde in one of the quiet parts of town . programme conference sessions will start on the morning of the wednesday and last a full three days . it is envisaged that most participants will arrive and register on the tuesday evening . our provisional programme looks like this : tuedsday 8 sept 15 . 00 onwards : registration wednesday 9 sept 08 . 30 - 09 . 30 : late registration 09 . 45 : opening of conference 10 . 00 - 18 . 00 : conference sessions 18 . 30 : pre-booked dinner 20 . 15 : drinks reception thursday 10 sept 08 . 30 - 18 . 00 : conference sessions 18 . 30 : pala agm 20 . 00 : pre-booked dinner friday 11 sept 08 . 30 - 17 . 00 : conference sessions 17 . 15 : wind-up session evening : activities to be arranged there will be continuous coffee , tea , etc . throughout the conference sessions . accommodation rooms in the vermeylen student hall of residence , a couple of hundred metres from the conference centre , are available to all participants . it is possible to book rooms for several nights either side of the conference dates . the price on the registration form includes breakfast . unfortunately , no double rooms are available . if you would prefer to stay in a hotel , we recommend the arcade hotel ( nederkouter , 9000 gent ; tel . 32-91 - 25 . 07 . 07 ) , which is only 10 minutes ' walk from the conference centre . alternatively , you can contact the gent tourist office ( meersstraat 138 , 9000 gent ; tel . 32-91 - 25 . 35 . 55 ) . food breakfast will be served in the overpoort , the university eating complex next door to the vermeylen . lunch and supper is also available there to conference participants , as are snacks throughout the day . 
there will be no single ' conference dinner ' as such , but to make it easier for participants to meet each other , we are arranging dinners for both wednesday and thursday evenings in the university restaurant . these have to be pre-booked . staying in gent gent ( population around 230 , 000 ) is a historic flemish city , the first in europe to declare itself independent of feudal control . it has a plethora of medieval vistas and bridges and is thus entitled to compete with bruges and amsterdam for the title of ' venice of the north ' . it is also a busy industrial city and the commercial and administrative centre for east flanders . the first language is flemish / dutch ( depending on one 's sociolinguistic viewpoint ) but nearly every-body can use both english and french with at least some degree of fluency . there are numerous restaurants , cafes and pubs near the conference area ( including two good vegetarian restaurants ) , many of which stay open well into the small hours . prices are cheap by northern european standards . for those wishing to combine the conference with a visit to gent and the surrounding area , you may like to know that a train can take you in less than an hour to bruges , brussels , antwerp or the belgian coast . you can even get into the ardennes or to paris within a few hours . registration / queries to attend the conference , fill in the registration form and return it , with payment , by 1st may . confirmation of registration and details of arrangements will be sent in the third circular to those who have registered , but if you have any enquiries , contact jim o'driscoll or stef slembrouck at seminarie voor engelse taalkunde , universiteit gent , rozier 44 , b-9000 gent , belgium ( tel : 32-91 - 64 . 37 . 88 / 89 / 90 ; fax : 32-91 - 64 . 41 . 95 ; e-mail pala92 @ engllang . rug . ac . be ) . 
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * pala 92 gent university registration form surname _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ first name ( s ) _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ address _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ affiliation _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ i will participate in the conference and enclose a eurocheque ( or have arranged direct transfer to the pala account in belgium ) to cover : ( tick as appropriate ) pala member conference fee ( bf 1000 ) _ _ _ _ _ _ non-member conference fee ( bf 2000 ) _ _ _ _ _ _ student conference fee ( bf 600 ) _ _ _ _ _ _ dinner on 9th september ( bf 500 ) _ _ _ _ _ _ dinner on 10th september ( bf 500 ) _ _ _ _ _ _ accommodation for tue 8th september ( bf 525 ) _ _ _ _ _ _ accommodation for wed 9th september ( bf 525 ) _ _ _ _ _ _ accommodation for thu 10th september ( bf 525 ) _ _ _ _ _ _ accommodation for fri 11th september ( bf 525 ) _ _ _ _ _ _ accommodation for ( specify ) ( bf ) _ _ _ _ _ _ fee for international money transfer or cheque other than eurocheques * ( bf 300 ) _ _ _ _ _ _ i therefore enclose ( or have transferred ) a total of bf _ _ _ _ _ _ i would like lacto-vegetarian / vegan food for the dinner ( s ) i have booked _ _ _ _ _ signature _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ please return to pala conference 1992 , seminarie voor engelse taalkunde , universiteit gent , rozier 44 , b-9000 gent , belgium ( pala9 @ engllang . rug . ac . be ) . the final date for registration is 1st may 1992 . _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ * . note that all payments must be made in belgian francs . cheques should be made payable to ' pala conference 1992 ' . 
a single eurocheque must not be of more than bf 7 , 000 . international money transfers should be sent via ' swift ' , quoting our bank 's swift number ( bbru be bb 900 ) and our account number : bbl 390-0959358 - 83 . if you have any problems with either method of payment , please contact the organizers . |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
Subject: query : causatives in korean | ||
|
||
could anyone point me to any books and articles about causative constructions in korean ? please send an e-mail directly to me . thanks you ! hiromi morikawa hiromi @ psych . stanford . edu |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
Subject: l2 learning / cultural empathy | ||
|
||
a graduate student in education approached a colleague of mine with a query which linguist people may be able to help with . he is doing an evaluation of an exchange program with indonesia one object of which is to prepare more high school teachers of indonesia here in australia . he wondered if there is anything written on the correlation of degree of acquisition of an l2 with degree of empathy and participation in the culture of the l2 speakers . high involvement / skills might seem to be a good thing for teachers in both areas ; however negative aspects might be there . apparently there seems to be a correlation between development of good skills in the language and dropping out of teaching . could this be due to " culture shock " on return to australia of those who became most deeply immersed ? any references / ideas gratefully received . if there are a number , i could summarise . patrick mcconvell , anthropology , northrn territory university , po box 40146 , casuarina , nt 0811 , australia |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
Subject: psycholinguistics teaching | ||
|
||
for an undergraduate course i will shortly be teaching in psycholinguistics , i would appreciate any suggestions as to texts which other instructors have had good experiences with . also , i would be indebted if anyone can offer specific references to the work of helen neville on deaf alinguals and the acquisiton of asl . thanks m . klaiman ( klaiman @ umnacux . bitnet ) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
Subject: german corpora | ||
|
||
i am looking for on-line corpora of modern german . any information would be appreciated . ken beesley beesley . parc @ xerox . com |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
Subject: t | ||
|
||
hi , help ! i have to design an experiment to do with mandarin tones as part of a phonology requirement on my graduate course . there seems to be very little literature on this in the library . if anyone can think of any on-going debates on the phonology / phonetics of mandarin tones for which an experiment would be useful , please could you give me information and references . i would welcome any suggestions at all . thanks a lot , sophia wang . ( sophia @ ling . ed . ac . uk ) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
Subject: job - university of utah | ||
|
||
the linguistics program at the university of utah invites applications for a one-year visiting assistant professor position to begin september , 1992 . minimum degree requirement is an a . b . d . candidates will be expected to teach an introductory undergraduate linguistics course and a course in american english for english teaching majors . they will also propose other undergraduate or m . a . level courses in general linguistics and sociolinguistics . send letter of application , curriculum vitae , sample publications , and three letters of reference to mauricio mixco , director , linguistics program , stewart building 213 , university of utah , salt lake city , ut 84112 . for further information you may telephone : 801-581 - 7432 or email dipaolo @ anthro . utah . edu . |
Oops, something went wrong.