# sentiment.R - dictionary-based sentiment scoring script
# Input data: the token table (`teaword`) and the document table (`tea`).
teaword <- read.csv("C:/Users/yhy/Desktop/自闭症/原始数据、分词/teaword.csv")
tea <- read.csv("C:/Users/yhy/Desktop/自闭症/原始数据、分词/tea.csv")

# Part 1: load and prepare the polarity dictionaries
### 1. Positive terms: strip trailing whitespace, label every row +1
pos <- read.table(
  "C:/Users/yhy/Desktop/自闭症/情感语料包/CNKI词语集/posCN.txt",
  encoding = "UTF-8"
)
pos[["V1"]] <- trimws(pos[["V1"]], which = "right")
weight <- rep(1, nrow(pos))
pos <- cbind(pos, weight)

### 2. Negative terms: strip trailing whitespace, label every row -1
neg <- read.table(
  "C:/Users/yhy/Desktop/自闭症/情感语料包/CNKI词语集/negCN.txt",
  encoding = "UTF-8"
)
neg[["V1"]] <- trimws(neg[["V1"]], which = "right")
weight <- rep(-1, nrow(neg))
neg <- cbind(neg, weight)

### 3. Stack both lists into a single term -> weight dictionary
posneg <- setNames(rbind(pos, neg), c("term", "weight"))
# Keep only the first row for each term. Positive rows are stacked first, so a
# term that appears in both lists keeps its +1 label.
first_occurrence <- !duplicated(posneg$term)
posneg <- posneg[first_occurrence, ]
# Part 2: sentiment score (method 1, polarity word lists)
### 1. Attach a dictionary weight to every token
testtea <- dplyr::left_join(teaword, posneg, by = c("word" = "term"))
in_dictionary <- !is.na(testtea$weight)
nonsense <- testtea[!in_dictionary, ]  # tokens with no dictionary entry
test <- testtea[in_dictionary, ]       # tokens that received a weight

### 2. Per-document sentiment index
# The formula method of aggregate() omits rows with NA by default, so the
# unmatched tokens in `testtea` do not contribute to the sums.
dictresult <- aggregate(weight ~ document, data = testtea, FUN = sum)
table(dictresult$weight)

### 3. Attach the score to the document table; `result` is the modelling data
result <- dplyr::left_join(tea, dictresult, by = c("Id" = "document"))
sum(is.na(result$weight))  # documents without any scored token
nrow(result)               # total number of documents
write.csv(result, file = "C:/Users/yhy/Desktop/自闭症/原始数据、分词/result.csv")
# Reload the file that was just written.
result <- read.csv(file = "C:/Users/yhy/Desktop/自闭症/原始数据、分词/result.csv")
# Method 2: graded emotion lexicon (one row per lexicon entry)
senti <- openxlsx::read.xlsx('C:/Users/yhy/Desktop/自闭症/情感语料包/情感词汇本体_大连理工大学信息检索/情感词汇本体.xlsx')

### 1. Map each emotion category code to a polarity (+1 / -1)
table(senti$情感分类)

# Explicit code -> polarity table. Any code that is not listed here gets an NA
# polarity and therefore an NA weight.
polarity_by_category <- c(
  PA = 1, PE = 1, PD = 1, PH = 1, PG = 1, PB = 1, PK = 1,
  NF = -1, NB = -1, NJ = -1, NH = -1, PF = -1,
  NI = -1, NC = -1, NG = -1, NE = -1, ND = -1,
  NN = -1, NK = -1, NL = -1, PC = -1
)

# Trim stray whitespace around the codes before the lookup (the method-1
# dictionary terms are trimmed in the same spirit), so that a padded code
# is not silently treated as unknown.
category_code <- trimws(as.character(senti$情感分类))
senti$sentitype <- unname(polarity_by_category[category_code])

# Diagnostics: which codes have no polarity rule, and how many rows they cover.
# NOTE(review): there is no rule for a category spelled "NA", and read.xlsx()
# imports the string "NA" as a missing value under its default `na.strings`.
# If the lexicon uses that code, those entries end up without a weight -
# confirm against the lexicon's documentation.
setdiff(unique(category_code), names(polarity_by_category))
sum(is.na(senti$sentitype))
table(senti$sentitype)

# Signed weight = polarity * intensity column of the lexicon.
senti$weight <- senti$sentitype * senti$强度
### 2. Attach lexicon weights to every token
# Build a one-row-per-word lookup before joining. If the lexicon lists a word
# more than once, left_join() would emit one output row per lexicon row and the
# token would be counted several times in the document sums. Entries without a
# weight are dropped first so they cannot shadow a weighted entry, then the
# first remaining entry per word is kept (the same first-occurrence rule the
# method-1 dictionary uses).
senti_lookup <- senti[!is.na(senti$weight), c('词语', 'weight')]
senti_lookup <- senti_lookup[!duplicated(senti_lookup$词语), ]
testtea2 <- dplyr::left_join(teaword, senti_lookup, by = c('word' = '词语'))
nonsense2 <- testtea2[is.na(testtea2$weight), ]   # tokens with no usable lexicon entry
test2 <- testtea2[!is.na(testtea2$weight), ]      # tokens that received a weight

### 3. Per-document sentiment index
dictresult2 <- aggregate(weight ~ document, data = test2, sum)
table(dictresult2$weight)
sum(dictresult2$weight < 0)   # documents with a negative total
sum(dictresult2$weight > 0)   # documents with a positive total

# Frequency polygon of the per-document scores.
library(ggplot2)
p1 <- ggplot(aes(x = weight), data = dictresult2) + geom_freqpoly()
# Print explicitly: a bare `p1` draws nothing when the script is run via source().
print(p1)

### 4. Attach the score to the document table; `result2` is the modelling data
result2 <- dplyr::left_join(tea, dictresult2, by = c('Id' = 'document'))
sum(!is.na(result2$weight))   # documents that received a score
length(result2$Id)            # total number of documents
write.csv(result2, 'C:/Users/yhy/Desktop/自闭症/原始数据、分词/result2.csv')