-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathGAMI_PostReconcileCleaningFunctions.R
173 lines (147 loc) · 6.17 KB
/
GAMI_PostReconcileCleaningFunctions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
# GAMI
# Post-reconciliation cleaning
# August 2020
# arsiders
# read in reconciled data
#csvFileName <- paste("GAMI_Reconciled_Cleaned",Sys.Date(),".csv",sep="")
#write.csv(data4, file=csvFileName)
# ====== REMOVE BOOK CHAPTERS (post-reconciliation)
# if the summary contains "book" or "chapter" as a stand-alone, space-delimited,
# lower-case word, remove every row of that article from the database
# Remove book chapters from the reconciled data.
#
# An article is treated as a book chapter when its summary contains the exact
# token "book" or "chapter". Tokens are obtained by splitting on single spaces,
# so the match is case-sensitive and does not see "Book", "books" or "chapter,".
# Once one row of an article is flagged, every row sharing its ID is dropped.
#
# Args:
#   data: data.frame with the article identifier in column 1, an `Article.ID`
#         column and a `Summarize` column.
#         (assumes column 1 is `Article.ID`, as the original code did -- confirm)
#
# Returns:
#   `data` without the flagged articles. Fully duplicated rows are collapsed
#   and row names are reset, matching the row-wise set difference that the
#   previous `setdiff(data, chaps)` call produced when dplyr was attached.
#
# Side effects:
#   Prints the number of distinct flagged article IDs.
bookchap <- function(data) {
  stopifnot(
    is.data.frame(data),
    all(c("Article.ID", "Summarize") %in% names(data))
  )

  # One character vector of space-separated tokens per row.
  tokens <- strsplit(
    as.character(data[["Summarize"]]),
    split = " ",
    fixed = TRUE
  )
  is_chapter <- vapply(
    tokens,
    function(words) any(c("book", "chapter") %in% words),
    logical(1)
  )

  chapters <- unique(data[[1]][is_chapter])
  print(c("Number book chapters IDs", length(chapters)))

  # Plain row filtering: no reliance on dplyr masking base::setdiff(), which
  # on data frames compares columns rather than rows.
  keep <- !(data$Article.ID %in% chapters)
  data2 <- unique(data[keep, , drop = FALSE])
  rownames(data2) <- NULL
  data2
}
# ====== Remove entries with "False" for Include or Sufficient
# Remove articles flagged FALSE in the Include / Sufficient columns.
#
# Columns 4 and 7 are coerced to logical (the coerced columns are part of the
# returned data). A row is flagged when column 7 is FALSE, or when column 4 is
# FALSE or NA. Every row sharing an ID with a flagged row is removed.
#
# Args:
#   data: data.frame with at least 7 columns, the article identifier in
#         column 1 and an `Article.ID` column.
#         (assumes column 1 is `Article.ID`, as the original code did -- confirm)
#
# Returns:
#   `data` (columns 4 and 7 as logical) without the flagged articles. Fully
#   duplicated rows are collapsed and row names are reset, matching the
#   row-wise set difference the previous `setdiff(data, notsuff)` call
#   produced when dplyr was attached.
#
# Side effects:
#   Prints the number of distinct removed article IDs. Warns when column 7 is
#   NA while column 4 is TRUE.
includesuff <- function(data) {
  stopifnot(
    is.data.frame(data),
    ncol(data) >= 7,
    "Article.ID" %in% names(data)
  )

  data[[7]] <- as.logical(data[[7]])
  data[[4]] <- as.logical(data[[4]])
  flag7 <- data[[7]]
  flag4 <- data[[4]]

  # The scalar test `flag7 == FALSE || ...` evaluated to NA for these rows and
  # aborted the whole run. They are not explicitly FALSE, so keep them, but
  # say so.
  # NOTE(review): confirm that keeping (rather than dropping) is the wanted
  # policy for an NA in column 7.
  undecided <- is.na(flag7) & !is.na(flag4) & flag4
  if (any(undecided)) {
    warning(
      sum(undecided),
      " row(s) have NA in column 7 and TRUE in column 4; they were kept",
      call. = FALSE
    )
  }

  # NA-free mask: `TRUE | NA` is TRUE, so an NA in column 4 flags the row.
  drop_row <- (!is.na(flag7) & !flag7) | is.na(flag4) | !flag4

  removeID <- unique(data[[1]][drop_row])
  print(c("Number insufficient IDs", length(removeID)))

  # Plain row filtering: no reliance on dplyr masking base::setdiff().
  keep <- !(data$Article.ID %in% removeID)
  data2 <- unique(data[keep, , drop = FALSE])
  rownames(data2) <- NULL
  data2
}
# ====== Fixing the HEALTH/CITIES typo (post-reconciliation)
# Problem: Decide which "Cities settlements & key infrastructure; Health" labels refer to cities and which to health
# Resolve the ambiguous "Cities settlements & key infrastructure; Health"
# sector label.
#
# Rows whose column 10 mentions "Cities settlements & key infrastructure" are
# re-labelled one by one; all other rows pass through untouched.
#   * "well-being & communities" is added when the exposure column lists
#     "Health & wellbeing" or the summary contains one of `healthwords`.
#   * "Cities settlements & key infrastructure" is added when the exposure
#     column lists "Work and economic growth", the summary contains one of
#     `citywords`, or the cross-cutting topics list "Cities and settlements by
#     the sea". Otherwise the ambiguous "...; Health" label is removed.
# Summary words are matched as exact, case-sensitive, space-delimited tokens.
#
# Args:
#   data: data.frame with columns `Summarize`, `1.2.Sector`,
#         `3.4.1.Exposure.vulnerability` and `1.3.Cross.cutting.topics`;
#         multi-valued cells are separated by "|||".
#         (assumes column 10 is `1.2.Sector`, as the original code did -- confirm)
#
# Returns:
#   The untouched rows followed by the re-labelled cities rows.
#
# Side effects:
#   Attaches stringr and dplyr; prints the number of cities-tagged rows.
healthcities <- function(data) {
  # Attached here, as before, so callers relying on that keep working.
  library(stringr)
  library(dplyr)

  # category A: cities code, no well-being ---> A&B: search for health or
  #             city/infrastructure-related words to decide labels
  # category B: cities code and well-being
  # category C: well-being code, no cities ---> assume articles relate to
  #             health; no modifications necessary
  healthwords <- c("health", "morbidity", "sanitation", "healthy", "disease",
                   "mortality")
  # Most frequent words from city-labelled Summaries, Response Types and
  # Implementation Tools.
  citywords <- c("cities", "city", "urban", "metropolitan", "towns", "Taiwan",
                 "transport", "transportation", "infrastructure", "building",
                 "buildings", "construction", "protection", "water", "coastal",
                 "defense", "built", "physical", "flood", "flooding")

  city_label <- "Cities settlements & key infrastructure"
  typo_label <- "Cities settlements & key infrastructure; Health"
  health_label <- "well-being & communities"

  # Split one "|||"-separated cell into trimmed labels. as.character() lets an
  # all-NA (logical) column through strsplit().
  split_labels <- function(cell) {
    trimws(unlist(strsplit(as.character(cell), split = "|||", fixed = TRUE)))
  }

  # Subset out cities-coded articles; everything else passes through.
  d <- data %>% filter(str_detect(data[, 10], city_label))
  data2 <- dplyr::setdiff(data, d)
  print(c("Cities tagged articles", nrow(d)))

  # seq_len() so that an empty subset skips the loop instead of visiting
  # rows 1 and 0.
  for (i in seq_len(nrow(d))) {
    sectors <- split_labels(d[i, "1.2.Sector"])
    exposures <- split_labels(d[i, "3.4.1.Exposure.vulnerability"])
    # Previously this result was assigned to `exposures`, so the exposure
    # column was never consulted and `topics` stayed untrimmed.
    topics <- split_labels(d[i, "1.3.Cross.cutting.topics"])
    summ <- unlist(
      strsplit(as.character(d[i, "Summarize"]), split = " ", fixed = TRUE)
    )

    # %in% never yields NA, so empty cells cannot break the `if` conditions.
    # Health evidence: add well-being (kept once if already present).
    if (("Health & wellbeing" %in% exposures) || any(healthwords %in% summ)) {
      sectors <- unique(c(sectors, health_label))
      d[i, "1.2.Sector"] <- paste(sectors, collapse = "|||")
    }

    keep_city <- ("Work and economic growth" %in% exposures) ||
      any(citywords %in% summ) ||
      ("Cities and settlements by the sea" %in% topics)

    if (keep_city) {
      sectors <- unique(c(sectors, city_label))
    } else {
      # Logical filtering: `x[-which(cond)]` empties x when nothing matches,
      # which used to wipe every sector of rows lacking the ambiguous label.
      sectors <- sectors[!(sectors %in% typo_label)]
    }
    d[i, "1.2.Sector"] <- paste(sectors, collapse = "|||")
  }

  # Label replacement is done afterwards in Excel:
  # d2<-gsub("well-being & communities","Health, well-being & communities",d)
  # d3<-gsub("Cities settlements & key infrastructure; Health","Cities settlements & key infrastructure",d2)
  rbind(data2, d)
}
# ===== MOST FREQUENT WORDS IN SUMMARY, RESPONSE, AND IMPLEMENTATION TO DECIDE ON "CITY WORDS"
# library(tm)
#
# Corpus = Corpus(VectorSource(d$Summarize))
# Corpus
# params = list(stopwords=T, wordLengths=c(3,20))
# dtm = DocumentTermMatrix(Corpus,control=params)
# matrix = as.matrix(dtm)
# dim(matrix)
# freq <- colSums(matrix)
# freq <- sort(freq, decreasing=TRUE)
#
# library(wordcloud)
# words <- names(freq)
# wordcloud(words[1:200], freq[1:200])
#
#
# RCorpus = Corpus(VectorSource(d$`3.1.2.Response-QUOTES`))
# Corpus
# params = list(stopwords=T, wordLengths=c(3,20))
# dtm = DocumentTermMatrix(RCorpus,control=params)
# matrix = as.matrix(dtm)
# dim(matrix)
# freq <- colSums(matrix)
# freq <- sort(freq, decreasing=TRUE)
# words <- names(freq)
# wordcloud(words[1:200], freq[1:200])
#
#
# ICorpus = Corpus(VectorSource(d$`3.2.2.ImpTools-QUOTES`))
# ICorpus
# params = list(stopwords=T, wordLengths=c(3,20))
# dtm = DocumentTermMatrix(ICorpus,control=params)
# matrix = as.matrix(dtm)
# dim(matrix)
# freq <- colSums(matrix)
# freq <- sort(freq, decreasing=TRUE)
# words <- names(freq)
# wordcloud(words[1:200], freq[1:200])