-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfenci.R
52 lines (45 loc) · 1.9 KB
/
fenci.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# Load the word-segmentation libraries.
library("rJava")
library("Rwordseg")
# Load the palette and word-cloud libraries.
library("RColorBrewer")
library("wordcloud")
# Read the input text (note: read.csv also reads plain .txt files).
myfile <- read.csv(file.choose(), header = FALSE)
# Pre-process: drop blank cells so every remaining element can be segmented.
myfile.res <- myfile[myfile != " "]
# Segment with the jiebaR analyzer and flatten the result to a character vector.
setAnalyzer("jiebaR")
myfile.words <- unlist(lapply(X = myfile.res, FUN = segmentCN))
# Strip URLs and whitespace; add further gsub() calls here for other unwanted
# characters, following the same pattern.
myfile.words <- gsub(pattern = "http:[a-zA-Z\\/\\.0-9]+", "", myfile.words)
myfile.words <- gsub("\n", "", myfile.words)
myfile.words <- gsub(" ", "", myfile.words)
# Remove stop words (the chosen file holds one stop word per row).
data_stw <- read.table(file = file.choose(), colClasses = "character")
# Vectorized: take the first column directly instead of growing a vector
# with c() inside a loop, and drop all stop words with a single %in% mask
# instead of one subset() call per stop word.
stopwords_CN <- data_stw[[1]]
myfile.words <- myfile.words[!(myfile.words %in% stopwords_CN)]
# Drop single-character tokens.
myfile.words <- subset(myfile.words, nchar(as.character(myfile.words)) > 1)
# Tabulate word frequencies, most frequent first.
myfile.freq <- table(unlist(myfile.words))
myfile.freq <- rev(sort(myfile.freq))
# BUG FIX: passing the table as `freq = myfile.freq` made data.frame() expand
# it via as.data.frame.table() into columns "freq.Var1"/"freq.Freq", so
# myfile.freq$Freq was NULL and the frequency filter below silently kept zero
# rows (and wordcloud() got empty input). Build the two columns explicitly.
myfile.freq <- data.frame(
  word = names(myfile.freq),
  Freq = as.integer(myfile.freq),
  stringsAsFactors = FALSE
)
# Keep only words that occur at least 10 times; adjust the threshold to taste.
# Reminder: the column is spelled "Freq" with a capital F.
myfile.freq2 <- subset(myfile.freq, myfile.freq$Freq >= 10)
# Draw the word cloud.
# Colour palette:
mycolors <- brewer.pal(8, "Dark2")
# Register a CJK-capable font (Windows only).
windowsFonts(myFont = windowsFont("微软雅黑"))
# Plot.
wordcloud(myfile.freq2$word, myfile.freq2$Freq,
  min.freq = 10, max.words = Inf, random.order = FALSE,
  random.color = FALSE, colors = mycolors, family = "myFont"
)