-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcws.bib
1347 lines (1234 loc) · 64.1 KB
/
cws.bib
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
@inproceedings{zhang_type-supervised_2014,
  author = {Zhang, Meishan and Zhang, Yue and Che, Wanxiang and Liu, Ting},
  title = {Type-Supervised Domain Adaptation for Joint Segmentation and {POS-Tagging}},
  booktitle = {Proceedings of the 14th Conference of the European Chapter of the Association for Computational Linguistics},
  pages = {588--597},
  publisher = {Association for Computational Linguistics},
  address = {Gothenburg, Sweden},
  month = apr,
  year = {2014},
  url = {http://www.aclweb.org/anthology/E14-1062},
  urldate = {2014-07-25},
  file = {Zhang+ 2014.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\7EAJWU43\\Zhang 等. - 2014 - Type-Supervised Domain Adaptation for Joint Segmen.pdf:application/pdf},
}
@inproceedings{qiu_automatic_2014,
  author = {Qiu, Xipeng and Huang, ChaoChao and Huang, Xuanjing},
  title = {Automatic Corpus Expansion for {Chinese} Word Segmentation by Exploiting the Redundancy of Web Information},
  booktitle = {Proceedings of {COLING} 2014, the 25th International Conference on Computational Linguistics: Technical Papers},
  address = {Dublin, Ireland},
  month = aug,
  year = {2014},
  url = {http://www.aclweb.org/anthology/C14-1109},
  urldate = {2014-08-20},
  annote = {拿不准的到网上去搜索},
  file = {[PDF] from aclweb.org:E\:\\Dropbox\\Others\\zotero\\storage\\JP29B6GG\\Qiu 等. - Automatic Corpus Expansion for Chinese Word Segmen.pdf:application/pdf},
}
@inproceedings{zhang_exploring_2013,
  author = {Zhang, Longkai and Wang, Houfeng and Sun, Xu and Mansur, Mairgup},
  title = {Exploring Representations from Unlabeled Data with Co-training for {Chinese} Word Segmentation},
  booktitle = {Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing},
  pages = {311--321},
  publisher = {Association for Computational Linguistics},
  address = {Seattle, Washington, {USA}},
  month = oct,
  year = {2013},
  url = {http://www.aclweb.org/anthology/D13-1031},
  urldate = {2013-12-03},
  file = {Zhang+ 2013.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\HKW9A2R7\\Zhang et al. - 2013 - Exploring Representations from Unlabeled Data with.pdf:application/pdf},
}
@inproceedings{zhang_improving_2013,
  author = {Zhang, Longkai and Li, Li and He, Zhengyan and Wang, Houfeng and Sun, Ni},
  title = {Improving {Chinese} Word Segmentation on Micro-blog Using Rich Punctuations},
  booktitle = {Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
  pages = {177--182},
  publisher = {Association for Computational Linguistics},
  address = {Sofia, Bulgaria},
  month = aug,
  year = {2013},
  url = {http://www.aclweb.org/anthology/P13-2032},
  urldate = {2013-08-06},
  file = {Zhang+ 2013.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\3SQP2B5I\\Zhang et al. - 2013 - Improving Chinese Word Segmentation on Micro-blog .pdf:application/pdf},
}
@inproceedings{zeng_graph-based_2013,
  author = {Zeng, Xiaodong and Wong, Derek F. and Chao, Lidia S. and Trancoso, Isabel},
  title = {Graph-based Semi-Supervised Model for Joint {Chinese} Word Segmentation and Part-of-Speech Tagging},
  booktitle = {Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  pages = {770--779},
  publisher = {Association for Computational Linguistics},
  address = {Sofia, Bulgaria},
  month = aug,
  year = {2013},
  url = {http://www.aclweb.org/anthology/P13-1076},
  urldate = {2013-08-06},
  file = {Zeng+ 2013.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\76SWQ76I\\Zeng et al. - 2013 - Graph-based Semi-Supervised Model for Joint Chines.pdf:application/pdf},
}
@inproceedings{zeng_co-regularizing_2013,
  author = {Zeng, Xiaodong and Wong, Derek F. and Chao, Lidia S. and Trancoso, Isabel},
  title = {Co-regularizing character-based and word-based models for semi-supervised {Chinese} word segmentation},
  booktitle = {Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
  pages = {171--176},
  publisher = {Association for Computational Linguistics},
  address = {Sofia, Bulgaria},
  month = aug,
  year = {2013},
  url = {http://www.aclweb.org/anthology/P13-2031},
  urldate = {2013-08-06},
  file = {Zeng+ 2013.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\DCFF4QW7\\Zeng et al. - 2013 - Co-regularizing character-based and word-based mod.pdf:application/pdf},
}
@incollection{wu_text_2013,
  author = {Wu, Ke and Gao, Zhiqiang and Peng, Cheng and Wen, Xiao},
  title = {Text Window Denoising Autoencoder: Building Deep Architecture for {Chinese} Word Segmentation},
  shorttitle = {Text Window Denoising Autoencoder},
  booktitle = {Natural Language Processing and Chinese Computing},
  pages = {1--12},
  publisher = {Springer},
  year = {2013},
  url = {http://link.springer.com/chapter/10.1007/978-3-642-41644-6_1},
  urldate = {2013-12-03},
  file = {10.1007-978-3-642-41644-6_1.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\UQD5JQ45\\10.1007-978-3-642-41644-6_1.pdf:application/pdf;Snapshot:E\:\\Dropbox\\Others\\zotero\\storage\\N4PXNSB2\\978-3-642-41644-6_1.html:text/html},
}
@inproceedings{wang_lattice-based_2013,
  author = {Wang, Zhiguo and Zong, Chengqing and Xue, Nianwen},
  title = {A Lattice-based Framework for Joint {Chinese} Word Segmentation, {POS} Tagging and Parsing},
  booktitle = {Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
  pages = {623--627},
  publisher = {Association for Computational Linguistics},
  address = {Sofia, Bulgaria},
  month = aug,
  year = {2013},
  url = {http://www.aclweb.org/anthology/P13-2110},
  urldate = {2013-08-06},
  file = {Wang+ 2013.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\CS2S2IZZ\\Wang et al. - 2013 - A Lattice-based Framework for Joint Chinese Word S.pdf:application/pdf},
}
@inproceedings{wang_mining_2013,
  author = {Wang, Aobo and Kan, Min-Yen},
  title = {Mining Informal Language from {Chinese} Microtext: Joint Word Recognition and Segmentation},
  shorttitle = {Mining Informal Language from {Chinese} Microtext},
  booktitle = {Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  pages = {731--741},
  publisher = {Association for Computational Linguistics},
  address = {Sofia, Bulgaria},
  month = aug,
  year = {2013},
  url = {http://www.aclweb.org/anthology/P13-1072},
  urldate = {2013-08-06},
  file = {Wang & Kan 2013.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\NCR43BK3\\Wang 和 Kan - 2013 - Mining Informal Language from Chinese Microtext J.pdf:application/pdf},
}
@inproceedings{jiang_discriminative_2013,
  author = {Jiang, Wenbin and Sun, Meng and Lü, Yajuan and Yang, Yating and Liu, Qun},
  title = {Discriminative Learning with Natural Annotations: Word Segmentation as a Case Study},
  shorttitle = {Discriminative Learning with Natural Annotations},
  booktitle = {Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  pages = {761--769},
  publisher = {Association for Computational Linguistics},
  address = {Sofia, Bulgaria},
  month = aug,
  year = {2013},
  url = {http://www.aclweb.org/anthology/P13-1075},
  urldate = {2013-08-06},
  file = {Jiang+ 2013.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\EK8FX6CM\\Jiang et al. - 2013 - Discriminative Learning with Natural Annotations .pdf:application/pdf},
}
@inproceedings{hagiwara_accurate_2013,
  author = {Hagiwara, Masato and Sekine, Satoshi},
  title = {Accurate Word Segmentation using Transliteration and Language Model Projection},
  booktitle = {Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
  pages = {183--189},
  publisher = {Association for Computational Linguistics},
  address = {Sofia, Bulgaria},
  month = aug,
  year = {2013},
  url = {http://www.aclweb.org/anthology/P13-2033},
  urldate = {2013-08-06},
  file = {Hagiwara & Sekine 2013.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\HTH64K8T\\Hagiwara 和 Sekine - 2013 - Accurate Word Segmentation using Transliteration a.pdf:application/pdf},
}
@phdthesis{__2012,
  author = {张, 开旭},
  title = {使用压缩表示的中文分词词性标注研究},
  school = {清华大学},
  year = {2012},
}
@inproceedings{sun_fast_2012,
  author = {Sun, Xu and Wang, Houfeng and Li, Wenjie},
  title = {Fast Online Training with Frequency-Adaptive Learning Rates for {Chinese} Word Segmentation and New Word Detection},
  booktitle = {Proceedings of the 50th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  pages = {253--262},
  publisher = {Association for Computational Linguistics},
  address = {Jeju Island, Korea},
  month = jul,
  year = {2012},
  url = {http://www.aclweb.org/anthology/P12-1027},
  urldate = {2012-07-24},
  annote = {{基于CRF的改进}
{引入新Feature:基于word} unigram和word bigram的,基于两个标签的复杂特征
引入新的权重更新算法,步长与特征频度有关,高频特征步长衰减快},
  file = {Sun+ 2012.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\26S4ZKME\\Sun et al. - 2012 - Fast Online Training with Frequency-Adaptive Learn.pdf:application/pdf},
}
@inproceedings{sun_reducing_2012,
  author = {Sun, Weiwei and Wan, Xiaojun},
  title = {Reducing Approximation and Estimation Errors for {Chinese} Lexical Processing with Heterogeneous Annotations},
  booktitle = {Proceedings of the 50th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  pages = {232--241},
  publisher = {Association for Computational Linguistics},
  address = {Jeju Island, Korea},
  month = jul,
  year = {2012},
  url = {http://www.aclweb.org/anthology/P12-1025},
  urldate = {2012-07-24},
  file = {Sun & Wan 2012.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\BCQHNZME\\Sun 和 Wan - 2012 - Reducing Approximation and Estimation Errors for C.pdf:application/pdf},
}
@inproceedings{sun_capturing_2012,
  author = {Sun, Weiwei and Uszkoreit, Hans},
  title = {Capturing Paradigmatic and Syntagmatic Lexical Relations: Towards Accurate {Chinese} Part-of-Speech Tagging},
  shorttitle = {Capturing Paradigmatic and Syntagmatic Lexical Relations},
  booktitle = {Proceedings of the 50th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  pages = {242--252},
  publisher = {Association for Computational Linguistics},
  address = {Jeju Island, Korea},
  month = jul,
  year = {2012},
  url = {http://www.aclweb.org/anthology/P12-1026},
  urldate = {2012-07-24},
  file = {Sun & Uszkoreit 2012.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\V69XSEDK\\Sun 和 Uszkoreit - 2012 - Capturing Paradigmatic and Syntagmatic Lexical Rel.pdf:application/pdf},
}
@inproceedings{qian_joint_2012,
  author = {Qian, Xian and Liu, Yang},
  title = {Joint {Chinese} Word Segmentation, {POS} Tagging and Parsing},
  booktitle = {Proceedings of the 2012 Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning},
  pages = {501--511},
  publisher = {Association for Computational Linguistics},
  address = {Jeju Island, Korea},
  month = jul,
  year = {2012},
  url = {http://www.aclweb.org/anthology/D12-1046},
  urldate = {2012-07-24},
  annote = {分词、词性标注、句法分析三个模型分别训练
然后解码的时候再合起来},
  file = {Qian & Liu 2012.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\KSIDXTSX\\Qian 和 Liu - 2012 - Joint Chinese Word Segmentation, POS Tagging and P.pdf:application/pdf},
}
@inproceedings{li_unified_2012,
  author = {Li, Zhongguo and Zhou, Guodong},
  title = {Unified Dependency Parsing of {Chinese} Morphological and Syntactic Structures},
  booktitle = {Proceedings of the 2012 Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning},
  pages = {1445--1454},
  publisher = {Association for Computational Linguistics},
  address = {Jeju Island, Korea},
  month = jul,
  year = {2012},
  url = {http://www.aclweb.org/anthology/D12-1132},
  urldate = {2012-07-24},
  file = {Li & Zhou 2012.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\V645P37I\\Li 和 Zhou - 2012 - Unified Dependency Parsing of Chinese Morphologica.pdf:application/pdf},
}
@inproceedings{li_integrating_2012,
  author = {Li, Xiaoqing and Wang, Kun and Zong, Chengqing and Su, Keh-Yih},
  title = {Integrating Surface and Abstract Features for Robust Cross-Domain {Chinese} Word Segmentation},
  booktitle = {Proceedings of {COLING} 2012},
  pages = {1653--1670},
  publisher = {The {COLING} 2012 Organizing Committee},
  address = {Mumbai, India},
  month = dec,
  year = {2012},
  url = {http://www.aclweb.org/anthology/C12-1101},
  file = {C12-1101.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\5GZT9V6S\\C12-1101.pdf:application/pdf},
}
@inproceedings{li_active_2012,
  author = {Li, Shoushan and Zhou, Guodong and Huang, Chu-Ren},
  title = {Active Learning for {Chinese} Word Segmentation},
  booktitle = {Proceedings of {COLING} 2012: Posters},
  pages = {683--692},
  publisher = {The {COLING} 2012 Organizing Committee},
  address = {Mumbai, India},
  month = dec,
  year = {2012},
  url = {http://www.aclweb.org/anthology/C12-2067},
  file = {C12-2067.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\W7X9QT8P\\C12-2067.pdf:application/pdf},
}
@inproceedings{jiang_iterative_2012,
  author = {Jiang, Wenbin and Meng, Fandong and Liu, Qun and Lü, Yajuan},
  title = {Iterative Annotation Transformation with Predict-Self Reestimation for {Chinese} Word Segmentation},
  booktitle = {Proceedings of the 2012 Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning},
  pages = {412--420},
  publisher = {Association for Computational Linguistics},
  address = {Jeju Island, Korea},
  month = jul,
  year = {2012},
  url = {http://www.aclweb.org/anthology/D12-1038},
  urldate = {2012-07-24},
  file = {Jiang+ 2012.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\9JEIXMU9\\Jiang et al. - 2012 - Iterative Annotation Transformation with Predict-S.pdf:application/pdf},
}
@inproceedings{hatori_incremental_2012,
  author = {Hatori, Jun and Matsuzaki, Takuya and Miyao, Yusuke and Tsujii, Jun'ichi},
  title = {Incremental Joint Approach to Word Segmentation, {POS} Tagging, and Dependency Parsing in {Chinese}},
  booktitle = {Proceedings of the 50th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  pages = {1045--1053},
  publisher = {Association for Computational Linguistics},
  address = {Jeju Island, Korea},
  month = jul,
  year = {2012},
  url = {http://www.aclweb.org/anthology/P12-1110},
  urldate = {2012-07-24},
  file = {Hatori+ 2012.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\VVTJ9R7I\\Hatori et al. - 2012 - Incremental Joint Approach to Word Segmentation, P.pdf:application/pdf},
}
@inproceedings{duan_cips-sighan_2012,
  author = {Duan, Huiming and Sui, Zhifang and Tian, Ye and Li, Wenjie},
  title = {The {CIPS-SIGHAN} {CLP} 2012 {Chinese} Word Segmentation on {MicroBlog} Corpora Bakeoff},
  booktitle = {Proceedings of the Second {CIPS-SIGHAN} Joint Conference on Chinese Language Processing},
  pages = {35--40},
  publisher = {Association for Computational Linguistics},
  address = {Tianjin, China},
  month = dec,
  year = {2012},
  url = {http://www.aclweb.org/anthology/W12-6307},
}
@article{zhang_syntactic_2011,
  author = {Zhang, Yue and Clark, Stephen},
  title = {Syntactic Processing using the Generalized Perceptron and Beam Search},
  journal = {Computational Linguistics},
  volume = {37},
  number = {1},
  pages = {105--151},
  year = {2011},
  keywords = {{CL}, perceptron},
  annote = {之前工作的总结。
将平均感知器,应用于汉语的词法分析、句法分析。
使用beam search。},
  file = {J11-1005.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\5VUSJ55Z\\J11-1005.pdf:application/pdf},
}
@inproceedings{wang_improving_2011,
  author = {Wang, Yiou and Kazama, Jun'ichi and Tsuruoka, Yoshimasa and Chen, Wenliang and Zhang, Yujie and Torisawa, Kentaro},
  title = {Improving {Chinese} Word Segmentation and {POS} Tagging with Semi-supervised Methods Using Large Auto-Analyzed Data},
  booktitle = {Proceedings of 5th International Joint Conference on Natural Language Processing},
  pages = {309--317},
  publisher = {Asian Federation of Natural Language Processing},
  address = {Chiang Mai, Thailand},
  month = nov,
  year = {2011},
  url = {http://www.aclweb.org/anthology/I11-1035},
  file = {I11-1035.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\GCPGXMC7\\I11-1035.pdf:application/pdf},
}
@article{wang_new_2011,
  author = {Wang, H. and Zhu, J. and Tang, S. and Fan, X.},
  title = {A New Unsupervised Approach to Word Segmentation},
  journal = {Computational Linguistics},
  pages = {1--48},
  year = {2011},
  note = {Just Accepted},
  keywords = {{CL}, unsupervised},
  file = {J11-3001.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\JTSKXP9F\\J11-3001.pdf:application/pdf},
}
@inproceedings{sun_stacked_2011,
  author = {Sun, Weiwei},
  title = {A Stacked Sub-Word Model for Joint {Chinese} Word Segmentation and Part-of-Speech Tagging},
  booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},
  pages = {1385--1394},
  publisher = {Association for Computational Linguistics},
  address = {Portland, Oregon, {USA}},
  month = jun,
  year = {2011},
  keywords = {{ACL}, stacked},
  url = {http://www.aclweb.org/anthology/P11-1139},
  urldate = {2011-06-17},
  annote = {使用stacked learning这种meta-learning algorithm,有机制避免两层在训练时使用重叠的训练数据,但也能最大限度利用数据。
第一层使用了三个模型,基于词的,基于字序列标注的,基于单字分类的。},
  file = {P11-1139.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\8MWJ725K\\P11-1139.pdf:application/pdf},
}
@inproceedings{sun_enhancing_2011,
  author = {Sun, Weiwei and Xu, Jia},
  title = {Enhancing {Chinese} Word Segmentation Using Unlabeled Data},
  booktitle = {Proceedings of the 2011 Conference on Empirical Methods in Natural Language Processing},
  pages = {970--979},
  publisher = {Association for Computational Linguistics},
  address = {Edinburgh, Scotland, {UK}},
  month = jul,
  year = {2011},
  keywords = {{EMNLP}, semi-supervised},
  url = {http://www.aclweb.org/anthology/D11-1090},
  urldate = {2011-08-04},
  annote = {feature engineering,使用in-domain的未标注数据帮助中文分词。
{增加的特征有:互信息;Accessor} Variety;基于标点符号的特征;篇章级的特征。
另外一个结论是使用实数值作为特征值不如用binary的。},
  file = {Sun & Xu 2011.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\ZN9PGR2V\\Sun and Xu - 2011 - Enhancing Chinese Word Segmentation Using Unlabele.pdf:application/pdf},
}
@inproceedings{li_parsing_2011,
  author = {Li, Zhongguo},
  title = {Parsing the Internal Structure of Words: A New Paradigm for {Chinese} Word Segmentation},
  shorttitle = {Parsing the Internal Structure of Words},
  booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},
  pages = {1405--1414},
  publisher = {Association for Computational Linguistics},
  address = {Portland, Oregon, {USA}},
  month = jun,
  year = {2011},
  keywords = {{ACL}, parsing},
  url = {http://www.aclweb.org/anthology/P11-1141},
  urldate = {2011-06-17},
  annote = {将词法分析与句法分析结合。在同一棵树下使用不同的“成分”标签。
使用句法分析的算法解码。},
  file = {P11-1141.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\9DMUAUZF\\P11-1141.pdf:application/pdf},
}
@inproceedings{zhao_cips-sighan_2010,
  author = {Zhao, Hongmei and Liu, Qun},
  title = {The {CIPS-SIGHAN} {CLP2010} {Chinese} Word Segmentation Backoff},
  booktitle = {{CIPS-SIGHAN} Joint Conference on Chinese Language Processing},
  year = {2010},
}
@inproceedings{zhang_fast_2010,
  author = {Zhang, Yue and Clark, Stephen},
  title = {A Fast Decoder for Joint Word Segmentation and {POS-Tagging} Using a Single Discriminative Model},
  booktitle = {Proceedings of the 2010 Conference on Empirical Methods in Natural Language Processing},
  pages = {843--852},
  publisher = {Association for Computational Linguistics},
  address = {Cambridge, {MA}},
  month = oct,
  year = {2010},
  url = {http://www.aclweb.org/anthology/D10-1082},
  annote = {解码速度从每秒2.24句,提高到每秒24.94就},
  file = {emnlp10yue.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\3C75ZFUB\\emnlp10yue.pdf:application/pdf},
}
@article{zhang_local_2010,
  author = {Zhang, K. and Sun, M. and Xue, P.},
  title = {A Local Generative Model for {Chinese} Word Segmentation},
  journal = {Information Retrieval Technology},
  pages = {420--431},
  year = {2010},
  annote = {提出一种用局部的语言模型做分词的方法。
{提出一种构造切分二叉树的方法,处理分词粒度问题,该方法也可直接利用CRF的输出构造二叉树。}},
  file = {fulltext.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\SUGAIZG4\\fulltext.pdf:application/pdf;Snapshot:E\:\\Dropbox\\Others\\zotero\\storage\\GC53NMIV\\60u60u58k06m426p.html:text/html},
}
@inproceedings{xiao_joint_2010,
  author = {Xiao, Xinyan and Liu, Yang and Hwang, YoungSook and Liu, Qun and Lin, Shouxun},
  title = {Joint Tokenization and Translation},
  booktitle = {Proceedings of the 23rd International Conference on Computational Linguistics (Coling 2010)},
  pages = {1200--1208},
  publisher = {Coling 2010 Organizing Committee},
  address = {Beijing, China},
  month = aug,
  year = {2010},
  url = {http://www.aclweb.org/anthology/C10-1135},
  file = {C10-1135.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\RFRTHEI5\\C10-1135.pdf:application/pdf},
}
@inproceedings{wang_character-based_2010,
  author = {Wang, Kun and Zong, Chengqing and Su, Keh-Yih},
  title = {A Character-Based Joint Model for {Chinese} Word Segmentation},
  booktitle = {Proceedings of the 23rd International Conference on Computational Linguistics (Coling 2010)},
  pages = {1173--1181},
  publisher = {Coling 2010 Organizing Committee},
  address = {Beijing, China},
  month = aug,
  year = {2010},
  url = {http://www.aclweb.org/anthology/C10-1132},
  annote = {整合一个产生式模型和判别式模型
另外发现将某些binary特征值的权重改一下,可以提高效果。},
  file = {2010.08 COLING-wangkun.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\JSS9PX3J\\2010.08 COLING-wangkun.pdf:application/pdf},
}
@inproceedings{qian_joint_2010,
  author = {Qian, X. and Zhang, Q. and Zhou, Y. and Huang, X. and Wu, L.},
  title = {Joint training and decoding using virtual nodes for cascaded segmentation and tagging tasks},
  booktitle = {Proceedings of the 2010 Conference on Empirical Methods in Natural Language Processing},
  pages = {187--195},
  year = {2010},
  keywords = {{EMNLP}},
  file = {D10-1019.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\87DEKU63\\D10-1019.pdf:application/pdf},
}
@inproceedings{_-_2009,
  author = {赵, 海 and 揭, 春雨 and 宋, 彦},
  title = {基于字依存树的中文词法-句法一体化分析},
  booktitle = {中国计算机语言学研究前沿进展 (2007-2009)},
  year = {2009},
  file = {基于字依存树的中文词法_句法一体化分析.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\ZIQKTMVJ\\基于字依存树的中文词法_句法一体化分析.pdf:application/pdf},
}
@phdthesis{__2009,
  author = {滕, 少华},
  title = {基于 {CRFs} 的中文分词和短文本分类技术},
  year = {2009},
  annote = {{就分词来说,用Chi方做特征选择,一半的特征仍然可以保持性能。}
个别字(如“的”,“和”,“了”)的有无对整句切分的正确性有帮助与干扰。
{使用CRF的置信度输出,低置信度产生高错误率。}
基于规则的、基于篇章上下文统计的低置信度后处理过程。},
  file = {thesis.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\EMD8MJVG\\thesis.pdf:application/pdf},
}
@inproceedings{zhao_character-level_2009,
  author = {Zhao, Hai},
  title = {Character-Level Dependencies in {Chinese}: Usefulness and Learning},
  booktitle = {Proceedings of the 12th Conference of the European Chapter of the {ACL} ({EACL} 2009)},
  pages = {879--887},
  publisher = {Association for Computational Linguistics},
  address = {Athens, Greece},
  month = mar,
  year = {2009},
  url = {http://www.aclweb.org/anthology/E09-1100},
  annote = {用字的依存树做分词。
最后系统,词内是词法字依存关系,词之间是线性依存关系。
当然最终效果没有现有最优系统好。},
  file = {E09-1100.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\XXR2ZEU4\\E09-1100.pdf:application/pdf},
}
@inproceedings{zhao_simple_2009,
  author = {Zhao, H. and Kit, C.},
  title = {A Simple and Efficient Model Pruning Method for Conditional Random Fields},
  booktitle = {Proceedings of the International Conference on Computer Processing of Oriental Languages ({ICCPOL} 2009)},
  pages = {145--155},
  publisher = {Springer},
  year = {2009},
  annote = {{CRF训练后,按参数值去掉大部分特征,性能都不会下降,用事实证明CRF有太多冗余。}},
  file = {PruneCRFs-20090107-ICCPOL09-final.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\T2GM3AQD\\PruneCRFs-20090107-ICCPOL09-final.pdf:application/pdf},
}
@article{tsai_chinese_2009,
  author = {Tsai, R. T. H.},
  title = {{Chinese} text segmentation: A hybrid approach using transductive learning and statistical association measures},
  journal = {Expert Systems with Applications},
  year = {2009},
  annote = {{多种加入各种特征提高CRF性能的方法。}},
  file = {sdarticle-1.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\GG9CIJ26\\sdarticle-1.pdf:application/pdf},
}
@inproceedings{mochihashi_bayesian_2009,
  author = {Mochihashi, Daichi and Yamada, Takeshi and Ueda, Naonori},
  title = {{Bayesian} Unsupervised Word Segmentation with Nested {Pitman-Yor} Language Modeling},
  booktitle = {Proceedings of the Joint Conference of the 47th Annual Meeting of the {ACL} and the 4th International Joint Conference on Natural Language Processing of the {AFNLP}},
  pages = {100--108},
  year = {2009},
  keywords = {{ACL}, unsupervised},
  annote = {{用Pitman-Yor,建立了两层语言模型,一个是词的,一个是} 句子的。},
  file = {P09-1012.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\KNTWK2T7\\P09-1012.pdf:application/pdf},
}
@article{li_punctuation_2009,
  author = {Li, Zhongguo and Sun, Maosong},
  title = {Punctuation as Implicit Annotations for {Chinese} Word Segmentation},
  journal = {Computational Linguistics},
  volume = {35},
  number = {4},
  pages = {505--512},
  year = {2009},
  keywords = {{CL}, {ME}, semi-supervised},
  file = {coli.2009.35.4.35403.lowlink.pdf_v03.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\56DRVTTK\\coli.2009.35.4.35403.lowlink.pdf_v03.pdf:application/pdf},
}
@inproceedings{kruengkrai_error-driven_2009,
  author = {Kruengkrai, Canasai and Uchimoto, Kiyotaka and Kazama, Jun'ichi and Wang, Yiou and Torisawa, Kentaro and Isahara, Hitoshi},
  title = {An Error-Driven Word-Character Hybrid Model for Joint {Chinese} Word Segmentation and {POS} Tagging},
  booktitle = {Proc. of {ACL-IJCNLP} 2009},
  pages = {513--521},
  publisher = {Association for Computational Linguistics},
  address = {Suntec, Singapore},
  year = {2009},
  keywords = {{ACL}},
  url = {http://www.aclweb.org/anthology/P/P09/P09-1058},
  annote = {词典词与生词分别对待},
  file = {P09-1058.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\UV5HTZU4\\P09-1058.pdf:application/pdf},
}
@inproceedings{jiang_automatic_2009,
  author = {Jiang, Wenbin and Huang, Liang and Liu, Qun},
  title = {Automatic Adaptation of Annotation Standards: {Chinese} Word Segmentation and {POS} Tagging -- A Case Study},
  booktitle = {Proceedings of the 47th {ACL}},
  pages = {522--530},
  publisher = {Association for Computational Linguistics},
  address = {Suntec, Singapore},
  month = aug,
  year = {2009},
  keywords = {{ACL}, perceptron},
  url = {http://www.aclweb.org/anthology/P/P09/P09-1059},
  annote = {Perceptron,分词与词性标注结合。将一种标注体系下的参数,转移到另一种标注体系中使用。},
  file = {P09-1059.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\CDPGIDHM\\P09-1059.pdf:application/pdf},
}
@inproceedings{zhao_unsupervised_2008,
  author = {Zhao, Hai and Kit, Chunyu},
  title = {Unsupervised segmentation helps supervised learning of character tagging for word segmentation and named entity recognition},
  booktitle = {The Sixth {SIGHAN} Workshop on Chinese Language Processing},
  pages = {106--111},
  year = {2008},
  annote = {将accessor variety ({AV)的结果离散化,然后分散到字,给为CRF的输入,可以提高分词效果。}},
  file = {I08-4017.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\M9VWFCWI\\I08-4017.pdf:application/pdf},
}
@inproceedings{zhao_empirical_2008,
  author = {Zhao, Hai and Kit, Chunyu},
  title = {An Empirical Comparison of Goodness Measures for Unsupervised {Chinese} Word Segmentation with a Unified Framework},
  booktitle = {The Third International Joint Conference on Natural Language Processing ({IJCNLP-2008}), Hyderabad, India},
  year = {2008},
  annote = {{描述了四种用于无监督中文分词的判别量:Frequency} of Substring with {ReductionDescription} Length Gain ({DLG)Accessor} Variety ({AV)Boundary} Entropy (Branching Entropy, {BE)}},
  file = {I08-1002.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\4K9T24X9\\I08-1002.pdf:application/pdf},
}
@inproceedings{zhang_joint_2008,
  author = {Zhang, Yue and Clark, Stephen},
  title = {Joint Word Segmentation and {POS} Tagging Using a Single Perceptron},
  booktitle = {Proceedings of {ACL-08:} {HLT}},
  pages = {888--896},
  publisher = {Association for Computational Linguistics},
  address = {Columbus, Ohio},
  month = jun,
  year = {2008},
  url = {http://www.aclweb.org/anthology/P/P08/P08-1101},
  file = {url.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\TV26HWGD\\url.pdf:application/pdf},
}
@inproceedings{xu_bayesian_2008,
  author = {Xu, J. and Gao, J. and Toutanova, K. and Ney, H.},
  title = {{Bayesian} Semi-Supervised {Chinese} Word Segmentation for Statistical Machine Translation},
  booktitle = {Proceedings of the 22nd International Conference on Computational Linguistics-Volume 1},
  pages = {1017--1024},
  publisher = {Association for Computational Linguistics},
  year = {2008},
  keywords = {{COLING}},
  file = {C08-1128.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\IFEN978A\\C08-1128.pdf:application/pdf},
}
@inproceedings{qiao_statistical_2008,
  author = {Qiao, W. and Sun, M. and Menzel, W.},
  title = {Statistical Properties of Overlapping Ambiguities in Chinese Word Segmentation and a Strategy for Their Disambiguation},
  booktitle = {Text, Speech and Dialogue},
  pages = {177--186},
  year = {2008},
  publisher = {Springer},
  file = {21.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\A6M5D3VX\\21.pdf:application/pdf}
}
@inproceedings{liu_information_2008,
  author = {Liu, Y. and Wang, B. and Ding, F. and Xu, S.},
  title = {Information retrieval oriented word segmentation based on character associative strength ranking},
  booktitle = {Proceedings of the Conference on Empirical Methods in Natural Language Processing},
  pages = {1061--1069},
  year = {2008},
  publisher = {Association for Computational Linguistics},
  keywords = {{EMNLP}},
  annote = {{用了RankingSVM的方法分词,用于IR}},
  file = {D08-1111.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\E3BV5RVM\\D08-1111.pdf:application/pdf;D08-1111.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\PJVB3N7N\\D08-1111.pdf:application/pdf}
}
@inproceedings{jin_fourth_2008,
  author = {Jin, Guangjin and Chen, Xiao},
  title = {The Fourth International Chinese Language Processing Bakeoff: Chinese Word Segmentation, Named Entity Recognition and Chinese {POS} Tagging},
  booktitle = {Proceedings of the Sixth {SIGHAN} Workshop on Chinese Language Processing},
  year = {2008}
}
@inproceedings{jiang_word_2008,
  author = {Jiang, Wenbin and Mi, Haitao and Liu, Qun},
  title = {Word Lattice Reranking for Chinese Word Segmentation and Part-of-Speech Tagging},
  booktitle = {Proceedings of the 22nd International Conference on Computational Linguistics (Coling 2008)},
  pages = {385--392},
  year = {2008},
  month = aug,
  address = {Manchester, {UK}},
  publisher = {Coling 2008 Organizing Committee},
  url = {http://www.aclweb.org/anthology/C08-1049},
  annote = {使用reranking。有别于top-n的reranking,使用指数规模的word lattice reranking。至少看oracle,后者比前者就好。
解决的问题有:如何构造lattice,如何算oracle,有哪些特征,以及reranking的时候的cube剪枝。},
  file = {C08-1049.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\ER5ZXIG9\\C08-1049.pdf:application/pdf}
}
@inproceedings{jiang_cascaded_2008,
  author = {Jiang, Wenbin and Huang, Liang and Liu, Qun and Lü, Yajuan},
  title = {A Cascaded Linear Model for Joint Chinese Word Segmentation and Part-of-Speech Tagging},
  booktitle = {Proceedings of {ACL}-08: {HLT}},
  pages = {897--904},
  year = {2008},
  month = jun,
  address = {Columbus, Ohio},
  publisher = {Association for Computational Linguistics},
  url = {http://www.aclweb.org/anthology/P/P08/P08-1102},
  file = {P08-1102.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\D6D9TK2G\\P08-1102.pdf:application/pdf}
}
@article{__2007,
  author = {赵, 海 and 揭, 春雨},
  title = {基于有效子串标注的中文分词},
  journal = {中文信息学报},
  volume = {21},
  number = {5},
  pages = {8--13},
  year = {2007},
  file = {基于有效子串标注的中文分词.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\JTK394VB\\基于有效子串标注的中文分词.pdf:application/pdf}
}
@article{__2007-1,
  author = {黄, 昌宁 and 赵, 海},
  title = {中文分词十年回顾},
  journal = {中文信息学报},
  volume = {21},
  number = {3},
  pages = {8--19},
  year = {2007},
  annote = {中文词的认同度。从863、973到SIGHAN评测。语料库的质量控制(包括对“心理词”的规则制定)。基于语法的、基于规则的不如基于词的,又被基于字的取代。大规模真实文本中未登录词造成的分词精度失落比歧义切分造成的精度失落至少大5倍以上。基于字的,最大熵,SVM,CRF等。词位转移,2标注,4标注,微软的6标注。5字窗口足够了。},
  file = {中文分词十年回顾.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\DBHJZ9BU\\中文分词十年回顾.pdf:application/pdf}
}
@inproceedings{zhang_chinese_2007,
  author = {Zhang, Yue and Clark, Stephen},
  title = {Chinese Segmentation with a Word-Based Perceptron Algorithm},
  booktitle = {Proceedings of the 45th Annual Meeting of the Association of Computational Linguistics},
  pages = {840--847},
  year = {2007},
  month = jun,
  address = {Prague, Czech Republic},
  publisher = {Association for Computational Linguistics},
  url = {http://www.aclweb.org/anthology/P/P07/P07-1106},
  keywords = {{ACL}},
  annote = {采用average perceptron,然后用一种lazy update的方法。
采用了基于词的特征,所以解码使用柱搜索,而不能用贪心或者动态规划。},
  file = {P07-1106.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\SXMSM8UQ\\P07-1106.pdf:application/pdf;P07-1106.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\GQ6KETAD\\P07-1106.pdf:application/pdf}
}
@inproceedings{shi_dual-layer_2007,
  author = {Shi, Y. and Wang, M.},
  title = {A dual-layer {CRFs} based joint decoding method for cascaded segmentation and labeling tasks},
  booktitle = {Proceedings of {IJCAI}},
  volume = {7},
  pages = {1707--1712},
  year = {2007},
  keywords = {{IJCAI}},
  annote = {{双层CRF做分词与词性标注,中规中矩。}
第一层基于字信息分词;第二层基于词,以及字信息标注词性。
{两层CRF分开训练,联合测试。第一层找N-best,再综合第一层第二层的结果重新排序。}},
  file = {IJCAI07-276.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\KBRZKT5Z\\IJCAI07-276.pdf:application/pdf}
}
@inproceedings{nakagawa_hybrid_2007,
  author = {Nakagawa, Tetsuji and Uchimoto, Kiyotaka},
  title = {A hybrid approach to word segmentation and {POS} tagging},
  booktitle = {Proceedings of the 45th Annual Meeting of the {ACL} on Interactive Poster and Demonstration Sessions},
  pages = {217--220},
  year = {2007},
  annote = {{字与词结合的Lattice,然后分词与标注结合。仍然用马尔可夫模型}},
  file = {P07-2055.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\ZHCMGXP6\\P07-2055.pdf:application/pdf}
}
@inproceedings{huang_rethinking_2007,
  author = {Huang, Chu-Ren and Simon, Petr and Hsieh, Shu-Kai and Prévot, L.},
  title = {Rethinking Chinese word segmentation: tokenization, character classification, or wordbreak identification},
  booktitle = {Proceedings of the 45th Annual Meeting of the {ACL} on Interactive Poster and Demonstration Sessions},
  pages = {69--72},
  year = {2007},
  address = {Morristown, {NJ}, {USA}},
  publisher = {Association for Computational Linguistics},
  keywords = {{ACL}},
  annote = {不使用字标注,直接关心字间间隔(断开与不断开)。
使用滑动窗口的方法进行判断。},
  file = {HKPolyU070411.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\GK9TZS89\\HKPolyU070411.pdf:application/pdf;P07-2018.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\IFQWMT45\\P07-2018.pdf:application/pdf}
}
@article{__2006,
  author = {李, 江波 and 周, 强 and 陈, 祖舜},
  title = {汉语词典的快速查询算法研究},
  journal = {中文信息学报},
  year = {2006},
  annote = {{双数组Trie数是相当高效的词典查询算法,适合中文分词。简单说是逐字哈希,而哈希函数是平凡的f(x)=x,而且不会有冲突。所以很快。但维护双数组也很难。}}
}
@inproceedings{zhao_improved_2006,
  author = {Zhao, H. and Huang, C. N. and Li, M.},
  title = {An improved Chinese word segmentation system with conditional random field},
  booktitle = {Proceedings of the Fifth {SIGHAN} Workshop on Chinese Language Processing},
  pages = {162--165},
  year = {2006},
  month = jul,
  address = {Sydney, Australia},
  publisher = {Association for Computational Linguistics},
  annote = {6-tag set
tone feature
assistant segmenters},
  file = {CSB-SIGHAN5_20071015-rev.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\C87THDC2\\CSB-SIGHAN5_20071015-rev.pdf:application/pdf}
}
@inproceedings{zhang_subword-based_2006,
  author = {Zhang, Ruiqiang and Kikui, Genichiro and Sumita, Eiichiro},
  title = {Subword-Based Tagging for Confidence-Dependent Chinese Word Segmentation},
  booktitle = {Proceedings of the {COLING/ACL} 2006 Main Conference Poster Sessions},
  pages = {961--968},
  year = {2006},
  month = jul,
  address = {Sydney, Australia},
  publisher = {Association for Computational Linguistics},
  url = {http://www.aclweb.org/anthology/P/P06/P06-2123},
  annote = {subword-based tagging, 比如北京市 标注为 北京/l 市/r
不过还是用的三标注系统
{使用CRF中的置信度,与基于词典的方法融合}
{CRF倾向于较高的OOV的F1,而较低的IV的F1}},
  file = {N06-2049.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\4KBGBZVR\\N06-2049.pdf:application/pdf}
}
@inproceedings{li_discriminative_2006,
  author = {Li, J. and Wang, H. and Ren, D. and Li, G.},
  title = {Discriminative pruning of language models for Chinese word segmentation},
  booktitle = {Proceedings of the 21st International Conference on Computational Linguistics and the 44th annual meeting of the Association for Computational Linguistics},
  pages = {1001--1008},
  year = {2006},
  publisher = {Association for Computational Linguistics},
  file = {P06-1126.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\36S6QKRR\\P06-1126.pdf:application/pdf}
}
@inproceedings{levow_third_2006,
  author = {Levow, Gina-Anne},
  title = {The Third International Chinese Language Processing Bakeoff: Word Segmentation and Named Entity Recognition},
  booktitle = {Proceedings of the Fifth {SIGHAN} Workshop on Chinese Language Processing},
  pages = {108--117},
  year = {2006},
  month = jul,
  address = {Sydney, Australia},
  publisher = {Association for Computational Linguistics},
  url = {http://www.aclweb.org/anthology/W/W06/W06-0115}
}
@inproceedings{jin_unsupervised_2006,
  author = {Jin, Zhihui and Tanaka-Ishii, Kumiko},
  title = {Unsupervised Segmentation of Chinese Text by Use of Branching Entropy},
  booktitle = {Proceedings of the {COLING/ACL} 2006 Main Conference Poster Sessions},
  pages = {428--435},
  year = {2006},
  month = jul,
  address = {Sydney, Australia},
  publisher = {Association for Computational Linguistics},
  url = {http://www.aclweb.org/anthology/P/P06/P06-2056},
  annote = {如nature,随着字母的读入,nature后面跟的字母的不确定性比natur大得多,所以认为前者是一个可能的词边界。
论文中以此为基础,算出句子每个子序列的边界熵(前向后先两个方向)以此为判据,禁欲},
  file = {P06-2056.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\KIS3H56R\\P06-2056.pdf:application/pdf}
}
@inproceedings{goldwater_contextual_2006,
  author = {Goldwater, Sharon and Griffiths, Thomas L. and Johnson, Mark},
  title = {Contextual Dependencies in Unsupervised Word Segmentation},
  booktitle = {Proceedings of the 21st International Conference on Computational Linguistics and the 44th annual meeting of the Association for Computational Linguistics},
  pages = {673--680},
  year = {2006},
  publisher = {Association for Computational Linguistics},
  annote = {{基于D过程的语言模型与词法模型两个词两个词的Gibbs采样}},
  file = {P06-1085.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\W8ZFPVTH\\P06-1085.pdf:application/pdf}
}
@article{__2005,
  author = {靳光瑾 and 肖航 and 富丽 and 章云帆},
  title = {现代汉语语料库建设及深加工},
  journal = {语言文字应用},
  volume = {2},
  pages = {111--120},
  year = {2005},
  url = {http://www.corpus4u.org/forum/upload/forum/2005072322202796.pdf},
  urldate = {2012-08-17},
  annote = {国家语委的语料库介绍},
  file = {[PDF] from corpus4u.org:E\:\\Dropbox\\Others\\zotero\\storage\\H4ZDBP95\\靳光瑾 et al. - 2005 - 现代汉语语料库建设及深加工.pdf:application/pdf}
}
@inproceedings{tseng_conditional_2005,
  author = {Tseng, H. and Chang, P. and Andrew, G. and Jurafsky, D. and Manning, C.},
  title = {A conditional random field word segmenter for {SIGHAN} bakeoff 2005},
  booktitle = {Proceedings of the Fourth {SIGHAN} Workshop on Chinese Language Processing},
  pages = {168--171},
  year = {2005},
  address = {Jeju Island, Korea},
  annote = {{SIGHAN} bakeoff 2005 中相当好的一个系统
{加了简单的词缀和叠字的feature在CRF里面}},
  file = {I05-3027.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\XE28MPSH\\I05-3027.pdf:application/pdf}
}
@inproceedings{li_perceptron_2005,
  author = {Li, Y. and Miao, C. and Bontcheva, K. and Cunningham, H.},
  title = {Perceptron Learning for Chinese Word Segmentation},
  booktitle = {Proceedings of the Fourth {SIGHAN} Workshop on Chinese Language Processing},
  pages = {154--157},
  year = {2005},
  file = {I05-3023.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\GG6F67ST\\I05-3023.pdf:application/pdf}
}
@article{gao_chinese_2005,
  author = {Gao, Jianfeng and Li, Mu and Huang, Chang-Ning and Wu, Andi},
  title = {Chinese Word Segmentation and Named Entity Recognition: A Pragmatic Approach},
  shorttitle = {Chinese Word Segmentation and Named Entity Recognition},
  journal = {Computational Linguistics},
  volume = {31},
  number = {4},
  pages = {531--574},
  year = {2005},
  month = dec,
  doi = {10.1162/089120105775299177},
  url = {http://dx.doi.org/10.1162/089120105775299177},
  urldate = {2009-03-04},
  keywords = {{CL}, perceptron},
  abstract = {This article presents a pragmatic approach to Chinese word segmentation. It differs from most previous approaches mainly in three respects. First, while theoretical linguists have defined Chinese words using various linguistic criteria, Chinese words in this study are defined pragmatically as segmentation units whose definition depends on how they are used and processed in realistic computer applications. Second, we propose a pragmatic mathematical framework in which segmenting known words and detecting unknown words of different types (i.e., morphologically derived words, factoids, named entities, and other unlisted words) can be performed simultaneously in a unified way. These tasks are usually conducted separately in other systems. Finally, we do not assume the existence of a universal word segmentation standard that is application-independent. Instead, we argue for the necessity of multiple segmentation standards due to the pragmatic fact that different natural language processing applications might require different granularities of Chinese words.},
  annote = {使用perceptron学习线性模型与基于字标注不同,解码前构造word lattice。相当于事先缩小了可能的字标注结果集合的大小。将词分为若干类,每一类会按概率计算一些概率值,作为perceptron的参数。perceptron的参数全是非binary的。只有词类的trigram的概率,不涉及任何具体字。},
  file = {089120105775299177.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\U342JTWV\\089120105775299177.pdf:application/pdf}
}
@inproceedings{emerson_second_2005,
  author = {Emerson, Thomas},
  title = {The Second International {Chinese} Word Segmentation Bakeoff},
  booktitle = {Proceedings of the Fourth {SIGHAN} Workshop on Chinese Language Processing},
  pages = {123--133},
  year = {2005},
  address = {Jeju Island, Korea},
  file = {I05-3017.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\JUU4DEM5\\I05-3017.pdf:application/pdf}
}
% NOTE(review): the author value below ("Duan, ZWXZH") looks like several
% authors' initials fused into one name -- confirm against the paper.
@article{duan_statistic_2005,
  author = {Duan, ZWXZH},
  title = {A Statistic Study of Three-character Unknown Words in Chinese},
  journal = {Journal of Chinese Language and Computing},
  volume = {15},
  number = {2},
  pages = {113--123},
  year = {2005},
  file = {JCLC_V15_N2_5.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\NER3EJ4J\\JCLC_V15_N2_5.pdf:application/pdf}
}
@inproceedings{chen_unigram_2005,
  author = {Chen, A. and Zhou, Y. and Zhang, A. and Sun, G.},
  title = {Unigram language model for Chinese word segmentation},
  booktitle = {Proceedings of the Fourth {SIGHAN} Workshop on Chinese Language Processing},
  pages = {138--141},
  year = {2005},
  url = {http://acl.ldc.upenn.edu/I/I05/I05-3019.pdf},
  urldate = {2012-10-22},
  file = {Full Text:E\:\\Dropbox\\Others\\zotero\\storage\\I7GTIX2H\\Chen et al. - 2005 - Unigram language model for Chinese word segmentati.pdf:application/pdf}
}
@inproceedings{asahara_combination_2005,
  author = {Asahara, M. and Fukuoka, K. and Azuma, A. and Goh, C. L. and Watanabe, Y. and Matsumoto, Y. and Tsuzuki, T.},
  title = {Combination of machine learning methods for optimum {Chinese} word segmentation},
  booktitle = {Proceedings of the Fourth {SIGHAN} Workshop on Chinese Language Processing},
  pages = {134--137},
  year = {2005},
  url = {http://acl.ldc.upenn.edu/I/I05/I05-3018.pdf},
  urldate = {2012-10-22},
  file = {Full Text:E\:\\Dropbox\\Others\\zotero\\storage\\MQFQ2Z3A\\Asahara et al. - 2005 - Combination of machine learning methods for optimu.pdf:application/pdf}
}
@article{__2004,
  author = {孙, 茂松 and 肖, 明 and 邹, 嘉彦},
  title = {基于无指导学习策略的无词表条件下的汉语自动分词},
  journal = {计算机学报},
  volume = {27},
  number = {6},
  pages = {736--742},
  year = {2004},
  annote = {使用互信息与t测试差当作两个判据以字为单位进行无监督分词。以字算的标注准确度可到85\%左右。},
  file = {基于词频统计的中文分词的研究.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\TBAU7EVF\\基于词频统计的中文分词的研究.pdf:application/pdf;基于无指导学习策略的无词表条件下的汉语自动分词.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\B6K4UGEW\\基于无指导学习策略的无词表条件下的汉语自动分词.pdf:application/pdf}
}
@inproceedings{peng_chinese_2004,
  author = {Peng, Fuchun and Feng, Fangfang and McCallum, Andrew},
  title = {Chinese Segmentation and New Word Detection using Conditional Random Fields},
  booktitle = {Proceedings of Coling 2004},
  pages = {562--568},
  year = {2004},
  month = aug,
  address = {Geneva, Switzerland},
  publisher = {{COLING}},
  annote = {{将CRF引入中文分词}},
  file = {C04-1081.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\Z95MIN2M\\C04-1081.pdf:application/pdf}
}
@inproceedings{ng_chinese_2004,
  author = {Ng, Hwee Tou and Low, Jin Kiat},
  editor = {Lin, Dekang and Wu, Dekai},
  title = {Chinese Part-of-Speech Tagging: One-at-a-Time or All-at-Once? Word-Based or Character-Based?},
  booktitle = {Proceedings of {EMNLP} 2004},
  pages = {277--284},
  year = {2004},
  month = jul,
  address = {Barcelona, Spain},
  publisher = {Association for Computational Linguistics},
  annote = {用最大熵模型试了三种方法,分开做分词与标注或者同时做,词性标注用基于字的特征或者用基于词的特征:
同时的基于字的最好,但是时间慢很多。
分开基于字的稍差,但快很多。
分开基于词的,分词性能当然与基于字的一样,但词性标注差很多,总时间快一点。词性标注差是因为词之中的字对确定词性很重要。
没有同时而且基于词的,估计是因为机器跑不动。也没有实验在分词阶段用基于词的特征。},
  file = {Ng.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\KDHXD98B\\Ng.pdf:application/pdf}
}
@inproceedings{kudo_applying_2004,
  author = {Kudo, T. and Yamamoto, K. and Matsumoto, Y.},
  title = {Applying conditional random fields to Japanese morphological analysis},
  booktitle = {Proceedings of {EMNLP} 2004},
  pages = {230--237},
  year = {2004},
  annote = {{用改造过的CRF模型做日文分词。以词为单位,即y长度与x不一定相等。}},
  file = {W04-3230.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\IEFZXRIT\\W04-3230.pdf:application/pdf}
}
@inproceedings{gao_adaptive_2004,
  author = {Gao, J. and Wu, A. and Li, M. and Huang, C. N. and Li, H. and Xia, X. and Qin, H.},
  title = {Adaptive Chinese word segmentation},
  booktitle = {Proceedings of {ACL-2004}},
  year = {2004},
  file = {133_pdf_2-col.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\AC3KXQNK\\133_pdf_2-col.pdf:application/pdf}
}
@inproceedings{feng_unsupervised_2004,
  author = {Feng, Haodi and Chen, Kang and Kit, Chunyu and Deng, Xiaotie},
  title = {Unsupervised segmentation of Chinese corpus using accessor variety},
  booktitle = {Natural Language Processing -- {IJCNLP} 2004},
  pages = {694--703},
  year = {2004},
  annote = {如何用Accessor variety 构造一个分词器。如何设计目标函数。},
  file = {download.pdf:E\:\\Dropbox\\Others\\zotero\\storage\\RTTCFJH3\\download.pdf:application/pdf}
}
@article{feng_accessor_2004,
title = {Accessor variety criteria for Chinese word extraction},
volume = {30},