-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathtext2vec_RegEx.R
32 lines (25 loc) · 1.09 KB
/
text2vec_RegEx.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# library(RPTT)
# listPageUrls = getListPageUrls("Gossiping")[1:5]
# postUrls = unlist(lapply(listPageUrls,getPostUrls))
# postData = lapply(postUrls, function(url) try(getPostData(url),TRUE))
# error_idx = which(sapply(postData, function(x) class(x)) == "try-error")
# data_idx = which(sapply(postData, function(x) class(x)) != "try-error")
# data = postData[data_idx]
# postDf = data.frame(do.call(rbind,lapply(postData[data_idx],function(xx) xx$postData )))
# pushDf = do.call(rbind,lapply(postData[data_idx],function(xx) xx$pushDf))
# Worked example on a single title: pull out every substring that matches the
# pattern and show the pieces joined by single spaces.
# NOTE(review): no active line in this file creates `postDf`; the statements
# that would build it are commented out above, so it must already exist in the
# workspace when this runs.
msg <- unlist(postDf$Title)[1]
msg  # echo the first title
RegExPattern <- "[\u4E00-\u9FA5]"  # one character in the range U+4E00..U+9FA5
matches <- gregexpr(RegExPattern, msg)
paste(
  unlist(regmatches(msg, matches)),
  collapse = " "
)
# Collect every match of `RegExPattern` found in `msg` and return them as one
# string, with the matches separated by single spaces.
#
# msg          Character vector to search.
# RegExPattern Regular expression to match. The default matches one character
#              in the range U+4E00..U+9FA5, so each such character becomes its
#              own token.
#
# Returns a length-one character string. It is "" when nothing matches. When
# `msg` has several elements, the matches from all of them are pooled into the
# same string, in order.
regExToken <- function(msg, RegExPattern = "[\u4E00-\u9FA5]") {
  hit_positions <- gregexpr(RegExPattern, msg)
  tokens <- unlist(regmatches(msg, hit_positions))
  paste(tokens, collapse = " ")
}
# Build a term-document matrix over all post titles using the tm package.
library(tm)

# Tokenise every title with regExToken(); each element of segRes is a single
# space-separated string, and tmWordsVec flattens them into a character vector.
messages <- unlist(postDf$Title)
segRes <- lapply(messages, regExToken)
tmWordsVec <- unlist(segRes)

# One corpus document per tokenised title.
corpus <- Corpus(VectorSource(tmWordsVec))

# wordLengths = c(1, Inf) lowers the minimum term length to 1 so that
# single-character tokens are kept (tm's documented default minimum is 3).
tdm <- TermDocumentMatrix(
  corpus,
  control = list(wordLengths = c(1, Inf))
)
inspect(tdm)