// main.cpp
// Scrapes the question listing pages under https://islamqa.org/category/maliki/,
// fetches each linked answer page, and writes the collected question/answer/source
// records to maliki_questions.json.
#include <iostream>
#include <string>
#include <curl/curl.h>
#include <regex>
#include <fstream>
#include <vector>
#include <libxml/HTMLparser.h>
#include <libxml/xpath.h>
#include <libxml/xpathInternals.h>
#include <iomanip> // for std::setw and std::setfill
/// One scraped record: a question, the answer text pulled from its page,
/// and the link to the answer's original source.
struct QnA
{
    std::string question; ///< Question text captured from the listing page.
    std::string answer;   ///< Answer text extracted from the question's own page.
    std::string source;   ///< href of the original-source link (empty if none was found).
};
/**
 * libcurl write callback: appends each received chunk to a std::string.
 *
 * @param contents Pointer to the received bytes (not null-terminated).
 * @param size     Size of one element of the chunk.
 * @param nmemb    Number of elements; the chunk holds size * nmemb bytes.
 * @param userp    Destination std::string, as registered with CURLOPT_WRITEDATA.
 * @return size * nmemb when the chunk was stored. Returns 0 when a non-empty
 *         chunk could not be stored (null pointer or allocation failure); a
 *         return value that differs from the chunk size makes libcurl abort
 *         the transfer with a write error.
 */
size_t WriteCallback(void *contents, size_t size, size_t nmemb, void *userp)
{
    const size_t total = size * nmemb;
    if (total == 0)
    {
        return 0; // Nothing to store; 0 equals the chunk size, so this is not an error.
    }
    if (contents == nullptr || userp == nullptr)
    {
        return 0; // Refuse the chunk instead of dereferencing a null pointer.
    }
    try
    {
        static_cast<std::string *>(userp)->append(static_cast<const char *>(contents), total);
    }
    catch (...)
    {
        // A C++ exception must not unwind through libcurl's C frames;
        // report the failure through the return value instead.
        return 0;
    }
    return total;
}
/**
 * Escapes a string so it can be placed between double quotes in a JSON document.
 *
 * Quotes and backslashes are backslash-escaped, the common control characters
 * use their short forms (\b \f \n \r \t), and every other byte below 0x20 is
 * written as a lowercase \u00XX escape. All remaining bytes (including bytes
 * >= 0x80) are copied through unchanged.
 *
 * Builds the result directly in a std::string, so it needs no stream headers
 * and carries no sticky stream formatting state.
 *
 * @param s Raw text to escape.
 * @return The escaped text, without surrounding quotes.
 */
std::string escape_json(const std::string &s)
{
    static const char kHexDigits[] = "0123456789abcdef";

    std::string out;
    out.reserve(s.size()); // Lower bound; escapes only make the result longer.

    for (const char ch : s)
    {
        switch (ch)
        {
        case '"':
            out += "\\\"";
            break;
        case '\\':
            out += "\\\\";
            break;
        case '\b':
            out += "\\b";
            break;
        case '\f':
            out += "\\f";
            break;
        case '\n':
            out += "\\n";
            break;
        case '\r':
            out += "\\r";
            break;
        case '\t':
            out += "\\t";
            break;
        default:
        {
            // Compare as unsigned so bytes >= 0x80 are never mistaken for
            // control characters when plain char is signed.
            const unsigned char byte = static_cast<unsigned char>(ch);
            if (byte < 0x20)
            {
                out += "\\u00";
                out += kHexDigits[byte >> 4];
                out += kHexDigits[byte & 0x0F];
            }
            else
            {
                out += ch;
            }
            break;
        }
        }
    }
    return out;
}
std::string http_get(const std::string &url)
{
CURL *curl = curl_easy_init();
std::string response;
if (curl)
{
curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response);
curl_easy_perform(curl); // Perform the request
curl_easy_cleanup(curl); // Always cleanup
}
return response;
}
std::vector<std::string> parse_and_extract_answer_and_link(const std::string &html_content)
{
std::string answer;
std::string source_link;
std::vector<std::string> result;
// Parse the HTML content into a DOM
htmlDocPtr doc = htmlReadDoc((xmlChar *)html_content.c_str(), NULL, NULL, HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
// If the document is parsed without errors
if (doc)
{
// Create a context for XPath evaluation
xmlXPathContextPtr context = xmlXPathNewContext(doc);
if (context)
{
// Evaluate XPath expression to find the answer within div#qna_only
xmlXPathObjectPtr answer_result = xmlXPathEvalExpression((xmlChar *)"//div[@id='qna_only']//text()", context);
if (answer_result)
{
xmlNodeSetPtr nodes = answer_result->nodesetval;
for (int i = 0; i < nodes->nodeNr; ++i)
{
xmlNodePtr cur = nodes->nodeTab[i];
xmlChar *content = xmlNodeGetContent(cur);
if (content)
{
answer += (char *)content;
xmlFree(content);
}
}
xmlXPathFreeObject(answer_result);
}
// Evaluate XPath expression to find the source link within div.original_source
xmlXPathObjectPtr link_result = xmlXPathEvalExpression((xmlChar *)"//div[@class='original_source']/a/@href", context);
if (link_result)
{
xmlNodeSetPtr nodes = link_result->nodesetval;
if (nodes->nodeNr > 0)
{
xmlChar *content = xmlNodeGetContent(nodes->nodeTab[0]);
if (content)
{
source_link = (char *)content;
xmlFree(content);
}
}
xmlXPathFreeObject(link_result);
}
// Clean up
xmlXPathFreeContext(context);
}
xmlFreeDoc(doc);
}
// Append answer and source link to the result vector
result.push_back(answer);
result.push_back(source_link);
// Return the result vector
return result;
}
int main()
{
LIBXML_TEST_VERSION
CURL *curl;
CURLcode res;
std::string responseString;
std::vector<std::string> questions; // Use a vector to store questions
std::vector<QnA> qna_pairs;
// More specific regex to match only the contents of the <a> tag within <h2> tags
// std::regex question_regex("<h2><a[^>]*>(.*?)</a></h2>");
std::regex question_regex("<h2><a[^>]+href=\"([^\"]+)\"[^>]*>([^<]+)</a></h2>");
std::regex page_count_regex("Pg 1 of (\\d+)"); // Regex to find the total number of pages
int total_pages = 0;
curl = curl_easy_init();
if (curl)
{
std::string page_count_url = "https://islamqa.org/category/maliki/";
curl_easy_setopt(curl, CURLOPT_URL, page_count_url.c_str());
// Set up other CURL options here...
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &responseString);
res = curl_easy_perform(curl);
if (res == CURLE_OK)
{
std::smatch page_count_match;
if (std::regex_search(responseString, page_count_match, page_count_regex) && page_count_match.size() > 1)
{
total_pages = std::stoi(page_count_match[1].str());
std::cout << "Total pages found: " << total_pages << std::endl;
}
responseString.clear();
}
else
{
std::cerr << "Failed to get total page count: " << curl_easy_strerror(res) << std::endl;
return -1; // Exit if we can't get the total page count
}
int questionCount = 0;
for (int page = 1; page <= total_pages; ++page)
{
std::cout << "Page Number: " << page << std::endl;
std::string url;
if (page == 1)
{
url = "https://islamqa.org/category/maliki/";
}
else
{
url = "https://islamqa.org/category/maliki/page/" + std::to_string(page) + "/";
}
curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
// Set up other CURL options here...
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &responseString);
// Set up other options such as CURLOPT_WRITEDATA, CURLOPT_FOLLOWLOCATION, etc.
// Clear the response string for each page
responseString.clear();
res = curl_easy_perform(curl);
if (res == CURLE_OK)
{
std::smatch matches;
// Use regex to search for and extract the question text
while (std::regex_search(responseString, matches, question_regex))
{
std::string link = matches[1].str(); // This is the captured question text
std::string question = matches[2].str();
std::cout << "Question: " << question << std::endl;
std::cout << "Link: " << link << std::endl;
std::string answer_html = http_get(link);
std::vector<std::string> answer_and_link = parse_and_extract_answer_and_link(answer_html);
std::cout << "Answer: " << answer_and_link[0] << std::endl;
// Erase up to the end of the matched <h2> tag to continue the search
qna_pairs.push_back({question, answer_and_link[0], answer_and_link[1]});
responseString = matches.suffix().str();
}
}
else
{
// Handle errors
std::cerr << "curl_easy_perform() failed: " << curl_easy_strerror(res) << std::endl;
}
}
// Always clean up
curl_easy_cleanup(curl);
}
// Now, serialize the questions to a JSON string
std::string jsonString = "{\"Maliki\": [\n";
for (const auto &qna : qna_pairs)
{
jsonString += "\t{\"Question\": \"" + escape_json(qna.question) +
"\", \"Answer\": \"" + escape_json(qna.answer) +
"\", \"Source\": \"" + escape_json(qna.source) + "\"},\n";
}
if (!jsonString.empty())
{
jsonString.pop_back(); // Remove the trailing comma
jsonString.pop_back(); // Remove the newline
}
jsonString += "\n]}";
// Write the JSON string to a file
std::ofstream jsonFile("maliki_questions.json");
if (jsonFile.is_open())
{
jsonFile << jsonString;
jsonFile.close();
std::cout << "Questions saved to maliki_questions.json" << std::endl;
}
else
{
std::cerr << "Failed to open file for writing." << std::endl;
}
return 0;
}