Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
fxsjy committed Sep 29, 2012
0 parents commit 6f6e812
Show file tree
Hide file tree
Showing 5 changed files with 395,965 additions and 0 deletions.
22 changes: 22 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Auto detect text files and perform LF normalization
* text=auto

# Custom for Visual Studio
*.cs diff=csharp
*.sln merge=union
*.csproj merge=union
*.vbproj merge=union
*.fsproj merge=union
*.dbproj merge=union

# Standard to msysgit
*.doc diff=astextplain
*.DOC diff=astextplain
*.docx diff=astextplain
*.DOCX diff=astextplain
*.dot diff=astextplain
*.DOT diff=astextplain
*.pdf diff=astextplain
*.PDF diff=astextplain
*.rtf diff=astextplain
*.RTF diff=astextplain
163 changes: 163 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
#################
## Eclipse
#################

*.pydevproject
.project
.metadata
bin/
tmp/
*.tmp
*.bak
*.swp
*~.nib
local.properties
.classpath
.settings/
.loadpath

# External tool builders
.externalToolBuilders/

# Locally stored "Eclipse launch configurations"
*.launch

# CDT-specific
.cproject

# PDT-specific
.buildpath


#################
## Visual Studio
#################

## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.

# User-specific files
*.suo
*.user
*.sln.docstates

# Build results
[Dd]ebug/
[Rr]elease/
*_i.c
*_p.c
*.ilk
*.meta
*.obj
*.pch
*.pdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.vspscc
.builds
*.dotCover

## TODO: If you have NuGet Package Restore enabled, uncomment this
#packages/

# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opensdf
*.sdf

# Visual Studio profiler
*.psess
*.vsp

# ReSharper is a .NET coding add-in
_ReSharper*

# Installshield output folder
[Ee]xpress

# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html

# Click-Once directory
publish

# Others
[Bb]in
[Oo]bj
sql
TestResults
*.Cache
ClientBin
stylecop.*
~$*
*.dbmdl
Generated_Code #added for RIA/Silverlight projects

# Backup & report files from converting an old project file to a newer
# Visual Studio version. Backup files are not needed, because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML



############
## Windows
############

# Windows image file caches
Thumbs.db

# Folder config file
Desktop.ini


#############
## Python
#############

*.py[co]

# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox

#Translations
*.mo

#Mr Developer
.mr.developer.cfg

# Mac crap
.DS_Store
60 changes: 60 additions & 0 deletions jieba/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import re
import math
import os,sys
import pprint

def gen_trie(f_name):
trie = {}
for line in open(f_name):
word,freq = line.strip().split(" ")
word = word.decode('utf-8')
p = trie
for c in word:
if not c in p:
p[c] ={}
p = p[c]
p['']='' #ending flag
return trie

_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
print >> sys.stderr, "loading dictionary..."
trie = gen_trie(os.path.join(_curpath,"dict.txt"))
print >> sys.stderr,"done."

def __cut(sentence):
N = len(sentence)
i,j=0,0
p = trie
while i<N:
c = sentence[j]
if c in p:
p = p[c]
if '' in p:
yield sentence[i:j+1]
j+=1
if j>=N:
i+=1
j=i
else:
p = trie
i+=1
j=i

def cut(sentence):
if not ( type(sentence) is unicode):
try:
sentence = sentence.decode('utf-8')
except:
sentence = sentence.decode('gbk','ignore')
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n]")
blocks = re_han.split(sentence)

for blk in blocks:
if re_han.match(blk):
for word in __cut(blk):
yield word
else:
tmp = re_skip.split(blk)
for x in tmp:
if x!="":
yield x
Loading

0 comments on commit 6f6e812

Please sign in to comment.