Skip to content

Commit 6c7761d

Browse files
committed
feat: Add C++ backend for strings module
- Implemented C++ backend for string algorithms (KMP, Rabin-Karp, Boyer-Moore, Z-function). - Implemented C++ backend for Trie data structure. - Added Python-C++ interface using `Python.h`. - Updated `_extension.py` and `setup.py` to integrate the new backend.
1 parent 4d1116d commit 6c7761d

File tree

10 files changed

+420
-0
lines changed

10 files changed

+420
-0
lines changed

pydatastructs/strings/_backend/__init__.py

Whitespace-only changes.
Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
#include "algorithms.hpp"
2+
#include <vector>
3+
#include <string>
4+
#include <unordered_map>
5+
#include <cmath>
6+
7+
// Knuth-Morris-Pratt Algorithm
8+
// Knuth-Morris-Pratt substring search.
//
// Returns the 0-based start offset of every occurrence of `query` in `text`
// (overlapping occurrences included), in increasing order. An empty text or
// an empty query yields an empty result.
std::vector<int> kmp_search(const std::string& text, const std::string& query) {
    std::vector<int> matches;
    if (query.empty() || text.empty()) {
        return matches;
    }

    const int pattern_len = static_cast<int>(query.size());
    const int text_len = static_cast<int>(text.size());

    // failure[i] is the pattern index to resume from after a mismatch at i;
    // -1 means "give up on this text character and move to the next one".
    std::vector<int> failure(query.size() + 1, 0);
    failure[0] = -1;

    int candidate = 0;
    int idx = 1;
    for (; idx < pattern_len; ++idx, ++candidate) {
        if (query[idx] == query[candidate]) {
            // Same character: a mismatch here would fail at `candidate` too,
            // so reuse its fallback directly.
            failure[idx] = failure[candidate];
            continue;
        }
        failure[idx] = candidate;
        while (candidate >= 0 && query[idx] != query[candidate]) {
            candidate = failure[candidate];
        }
    }
    // Fallback used right after a full match has been reported.
    failure[idx] = candidate;

    int text_pos = 0;
    int pattern_pos = 0;
    while (text_pos < text_len) {
        if (query[pattern_pos] != text[text_pos]) {
            pattern_pos = failure[pattern_pos];
            if (pattern_pos < 0) {
                ++text_pos;
                ++pattern_pos;
            }
            continue;
        }

        ++text_pos;
        ++pattern_pos;
        if (pattern_pos == pattern_len) {
            matches.push_back(text_pos - pattern_pos);
            pattern_pos = failure[pattern_pos];
        }
    }

    return matches;
}
48+
49+
// Rabin-Karp Algorithm
50+
// Rabin-Karp substring search using a polynomial rolling hash.
//
// Returns the 0-based start offset of every occurrence of `query` in `text`
// (overlapping occurrences included), in increasing order. Returns an empty
// vector when either string is empty or when `query` is longer than `text`.
//
// Bytes are hashed as unsigned values so that the initial hash and the rolled
// hash agree for bytes >= 0x80 (e.g. UTF-8 encoded text); every hash value is
// kept in the range [0, MOD).
std::vector<int> rabin_karp_search(const std::string& text, const std::string& query) {
    std::vector<int> positions;
    // A query longer than the text cannot match, and hashing its first window
    // would read past the end of `text`.
    if (text.empty() || query.empty() || query.size() > text.size()) {
        return positions;
    }

    const long long BASE = 257;
    const long long MOD = 1000000007;
    const std::size_t t = text.size();
    const std::size_t q = query.size();

    // Reads one byte as a non-negative value regardless of char signedness.
    auto byte_at = [](const std::string& s, std::size_t i) -> long long {
        return static_cast<unsigned char>(s[i]);
    };

    // BASE^(q-1) mod MOD: weight of the byte that leaves the window.
    long long high_power = 1;
    for (std::size_t i = 0; i + 1 < q; ++i) {
        high_power = (high_power * BASE) % MOD;
    }

    // Hash of the query and of the first window of the text.
    long long query_hash = 0;
    long long window_hash = 0;
    for (std::size_t i = 0; i < q; ++i) {
        query_hash = (query_hash * BASE + byte_at(query, i)) % MOD;
        window_hash = (window_hash * BASE + byte_at(text, i)) % MOD;
    }

    for (std::size_t start = 0; start + q <= t; ++start) {
        // Equal hashes are only a hint; confirm with a real comparison.
        // compare() checks in place instead of allocating a substring.
        if (query_hash == window_hash && text.compare(start, q, query) == 0) {
            positions.push_back(static_cast<int>(start));
        }
        if (start + q < t) {
            // Drop the leading byte (adding MOD keeps the value non-negative),
            // then shift and append the next byte.
            const long long leaving = (byte_at(text, start) * high_power) % MOD;
            window_hash = (window_hash + MOD - leaving) % MOD;
            window_hash = (window_hash * BASE + byte_at(text, start + q)) % MOD;
        }
    }

    return positions;
}
85+
86+
// Boyer-Moore Algorithm
87+
// Boyer-Moore substring search using the bad-character rule.
//
// Returns the 0-based start offset of every occurrence of `query` in `text`
// (overlapping occurrences included), in increasing order. Returns an empty
// vector when either string is empty or when `query` is longer than `text`.
std::vector<int> boyer_moore_search(const std::string& text, const std::string& query) {
    std::vector<int> positions;
    // The length check must come before the main loop: subtracting the sizes
    // in unsigned arithmetic would wrap around for a query longer than the
    // text and let the scan run off the end of `text`.
    if (text.empty() || query.empty() || query.size() > text.size()) {
        return positions;
    }

    const int t = static_cast<int>(text.size());
    const int q = static_cast<int>(query.size());

    // Preprocessing: index of the last occurrence of each query character.
    std::unordered_map<char, int> last_index;
    for (int i = 0; i < q; ++i) {
        last_index[query[i]] = i;
    }

    // Lookup that reports -1 for characters absent from the query and never
    // inserts into the table (operator[] would add an entry per text char).
    auto last_occurrence = [&last_index](char c) -> int {
        const auto it = last_index.find(c);
        return it == last_index.end() ? -1 : it->second;
    };

    // Searching: compare right-to-left at each alignment.
    int shift = 0;
    while (shift <= t - q) {
        int j = q - 1;
        while (j >= 0 && query[j] == text[shift + j]) {
            --j;
        }

        if (j < 0) {
            positions.push_back(shift);
            // Align the character just past the window with its last
            // occurrence in the query; at the end of the text step by one.
            shift += (shift + q < t) ? q - last_occurrence(text[shift + q]) : 1;
        } else {
            // Bad-character shift, never less than one position.
            const int skip = j - last_occurrence(text[shift + j]);
            shift += skip > 1 ? skip : 1;
        }
    }

    return positions;
}
114+
115+
// Z-Function Algorithm
116+
// Z-function substring search.
//
// Returns the 0-based start offset of every occurrence of `query` in `text`
// (overlapping occurrences included), in increasing order. An empty text or
// an empty query yields an empty result.
//
// The Z-array is computed over `query + text` with no separator character, and
// a position counts as a match when its Z-value is at least the query length.
// This keeps the result correct for any input bytes; a separator-based
// equality test drops matches when the text itself contains the separator.
std::vector<int> z_function_search(const std::string& text, const std::string& query) {
    std::vector<int> positions;
    if (text.empty() || query.empty()) {
        return positions;
    }

    const std::string combined = query + text;
    const int total = static_cast<int>(combined.size());
    const int q = static_cast<int>(query.size());

    // z[i] = length of the longest common prefix of combined and combined[i..].
    std::vector<int> z(combined.size(), 0);
    int left = 0;   // [left, right] is the rightmost segment known to match a prefix
    int right = 0;

    for (int i = 1; i < total; ++i) {
        if (i <= right) {
            // Reuse the value computed for the mirrored position, clipped to
            // the part of the segment that is still ahead of i.
            const int remaining = right - i + 1;
            const int mirrored = z[i - left];
            z[i] = mirrored < remaining ? mirrored : remaining;
        }
        while (i + z[i] < total && combined[z[i]] == combined[i + z[i]]) {
            ++z[i];
        }
        if (i + z[i] - 1 > right) {
            left = i;
            right = i + z[i] - 1;
        }
    }

    // Indices >= q lie inside the text part; z[i] >= q means the whole query
    // matches there, and z[i] <= total - i guarantees the window fits.
    for (int i = q; i < total; ++i) {
        if (z[i] >= q) {
            positions.push_back(i - q);
        }
    }

    return positions;
}
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#ifndef STRINGS_ALGORITHMS_HPP
2+
#define STRINGS_ALGORITHMS_HPP
3+
4+
#include <Python.h>
5+
#include <vector>
6+
#include <string>
7+
#include <unordered_map>
8+
#include <cmath>
9+
10+
// Knuth-Morris-Pratt Algorithm
11+
// Returns the 0-based start offsets at which `query` occurs in `text`.
// The result is empty when either string is empty.
std::vector<int> kmp_search(const std::string& text, const std::string& query);
12+
13+
// Rabin-Karp Algorithm
14+
// Returns the 0-based start offsets at which `query` occurs in `text`,
// located with a rolling hash and confirmed by direct comparison.
// The result is empty when either string is empty.
std::vector<int> rabin_karp_search(const std::string& text, const std::string& query);
15+
16+
// Boyer-Moore Algorithm
17+
// Returns the 0-based start offsets at which `query` occurs in `text`,
// scanning each alignment right-to-left with a bad-character table.
// The result is empty when either string is empty.
std::vector<int> boyer_moore_search(const std::string& text, const std::string& query);
18+
19+
// Z-Function Algorithm
20+
// Returns the 0-based start offsets at which `query` occurs in `text`,
// derived from the Z-array of the query concatenated with the text.
// The result is empty when either string is empty.
std::vector<int> z_function_search(const std::string& text, const std::string& query);
21+
22+
#endif
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
#include <Python.h>
2+
#include "algorithms/algorithms.hpp"
3+
#include "trie/trie.hpp"
4+
#include "utils/_backend/cpp/string.hpp"
5+
6+
// Python wrapper for KMP algorithm
7+
// Python entry point `kmp_search(text, query)`.
//
// Takes exactly two positional arguments as generic objects ("OO"), converts
// both with pyobj_to_string, runs kmp_search and hands the resulting offsets
// to vector_to_pylist. Returns NULL when argument parsing fails.
// NOTE(review): pyobj_to_string / vector_to_pylist are declared in
// utils/_backend/cpp/string.hpp (not shown here); their error behaviour for
// non-string arguments should be confirmed there.
static PyObject* py_kmp_search(PyObject* self, PyObject* args) {
    PyObject* haystack_obj = NULL;
    PyObject* needle_obj = NULL;
    const bool parsed = PyArg_ParseTuple(args, "OO", &haystack_obj, &needle_obj) != 0;
    if (!parsed) {
        return NULL;
    }

    std::string haystack = pyobj_to_string(haystack_obj);
    std::string needle = pyobj_to_string(needle_obj);

    std::vector<int> offsets = kmp_search(haystack, needle);
    return vector_to_pylist(offsets);
}
17+
18+
// Python wrapper for Rabin-Karp algorithm
19+
// Python entry point `rabin_karp_search(text, query)`.
//
// Takes exactly two positional arguments as generic objects ("OO"), converts
// both with pyobj_to_string, runs rabin_karp_search and hands the resulting
// offsets to vector_to_pylist. Returns NULL when argument parsing fails.
// NOTE(review): pyobj_to_string / vector_to_pylist are declared in
// utils/_backend/cpp/string.hpp (not shown here); their error behaviour for
// non-string arguments should be confirmed there.
static PyObject* py_rabin_karp_search(PyObject* self, PyObject* args) {
    PyObject* haystack_obj = NULL;
    PyObject* needle_obj = NULL;
    const bool parsed = PyArg_ParseTuple(args, "OO", &haystack_obj, &needle_obj) != 0;
    if (!parsed) {
        return NULL;
    }

    std::string haystack = pyobj_to_string(haystack_obj);
    std::string needle = pyobj_to_string(needle_obj);

    std::vector<int> offsets = rabin_karp_search(haystack, needle);
    return vector_to_pylist(offsets);
}
29+
30+
// Python wrapper for Boyer-Moore algorithm
31+
// Python entry point `boyer_moore_search(text, query)`.
//
// Takes exactly two positional arguments as generic objects ("OO"), converts
// both with pyobj_to_string, runs boyer_moore_search and hands the resulting
// offsets to vector_to_pylist. Returns NULL when argument parsing fails.
// NOTE(review): pyobj_to_string / vector_to_pylist are declared in
// utils/_backend/cpp/string.hpp (not shown here); their error behaviour for
// non-string arguments should be confirmed there.
static PyObject* py_boyer_moore_search(PyObject* self, PyObject* args) {
    PyObject* haystack_obj = NULL;
    PyObject* needle_obj = NULL;
    const bool parsed = PyArg_ParseTuple(args, "OO", &haystack_obj, &needle_obj) != 0;
    if (!parsed) {
        return NULL;
    }

    std::string haystack = pyobj_to_string(haystack_obj);
    std::string needle = pyobj_to_string(needle_obj);

    std::vector<int> offsets = boyer_moore_search(haystack, needle);
    return vector_to_pylist(offsets);
}
41+
42+
// Python wrapper for Z-function algorithm
43+
// Python entry point `z_function_search(text, query)`.
//
// Takes exactly two positional arguments as generic objects ("OO"), converts
// both with pyobj_to_string, runs z_function_search and hands the resulting
// offsets to vector_to_pylist. Returns NULL when argument parsing fails.
// NOTE(review): pyobj_to_string / vector_to_pylist are declared in
// utils/_backend/cpp/string.hpp (not shown here); their error behaviour for
// non-string arguments should be confirmed there.
static PyObject* py_z_function_search(PyObject* self, PyObject* args) {
    PyObject* haystack_obj = NULL;
    PyObject* needle_obj = NULL;
    const bool parsed = PyArg_ParseTuple(args, "OO", &haystack_obj, &needle_obj) != 0;
    if (!parsed) {
        return NULL;
    }

    std::string haystack = pyobj_to_string(haystack_obj);
    std::string needle = pyobj_to_string(needle_obj);

    std::vector<int> offsets = z_function_search(haystack, needle);
    return vector_to_pylist(offsets);
}
53+
54+
// Define the module's method table
55+
// Method table of the `_strings` module. Each entry is
// {Python-visible name, C entry point, calling convention, docstring};
// every function takes positional arguments only (METH_VARARGS).
static PyMethodDef StringsMethods[] = {
    {
        "kmp_search",
        py_kmp_search,
        METH_VARARGS,
        "Perform KMP search"
    },
    {
        "rabin_karp_search",
        py_rabin_karp_search,
        METH_VARARGS,
        "Perform Rabin-Karp search"
    },
    {
        "boyer_moore_search",
        py_boyer_moore_search,
        METH_VARARGS,
        "Perform Boyer-Moore search"
    },
    {
        "z_function_search",
        py_z_function_search,
        METH_VARARGS,
        "Perform Z-function search"
    },
    // Sentinel entry marking the end of the table.
    {NULL, NULL, 0, NULL}
};
62+
63+
// Define the module
64+
// Module definition handed to PyModule_Create in PyInit__strings.
static struct PyModuleDef stringsmodule = {
    PyModuleDef_HEAD_INIT,
    "_strings",      // m_name: name of the module
    NULL,            // m_doc: no module docstring
    -1,              // m_size: no per-module state is allocated
    StringsMethods   // m_methods: functions exposed by the module
};
71+
72+
// Module initialization function
73+
// Initialization hook for the `_strings` extension: creates the module object
// from `stringsmodule` and returns it (whatever PyModule_Create yields is
// passed through unchanged).
PyMODINIT_FUNC PyInit__strings(void) {
    PyObject* module_obj = PyModule_Create(&stringsmodule);
    return module_obj;
}
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
#include "trie.hpp"
2+
3+
// Destroys this node and every node reachable through `children`.
//
// The subtree is torn down iteratively with an explicit work list. Letting
// each child's destructor delete its own children would nest one call per
// trie level, so a single very long key could exhaust the call stack.
TrieNode::~TrieNode() {
    std::vector<TrieNode*> pending;
    for (auto& entry : children) {
        pending.push_back(entry.second);
    }
    children.clear();

    while (!pending.empty()) {
        TrieNode* node = pending.back();
        pending.pop_back();
        if (node == nullptr) {
            continue;  // deleting a null child was a no-op before; keep it one
        }
        // Take over the grandchildren first so that `delete node` below sees
        // an empty map and its destructor returns without descending.
        for (auto& entry : node->children) {
            pending.push_back(entry.second);
        }
        node->children.clear();
        delete node;
    }
}
8+
9+
// Builds an empty trie: the root is a freshly allocated node created with
// TrieNode's default argument.
Trie::Trie() : root(new TrieNode()) {}
12+
13+
// Frees the root node; the nodes below it are released by TrieNode's
// destructor.
Trie::~Trie() {
    delete root;
}
16+
17+
void Trie::insert(const std::string& word) {
18+
TrieNode* current = root;
19+
for (char ch : word) {
20+
if (current->children.find(ch) == current->children.end()) {
21+
current->children[ch] = new TrieNode(ch);
22+
}
23+
current = current->children[ch];
24+
}
25+
current->is_terminal = true;
26+
}
27+
28+
bool Trie::search(const std::string& word) {
29+
TrieNode* current = root;
30+
for (char ch : word) {
31+
if (current->children.find(ch) == current->children.end()) {
32+
return false;
33+
}
34+
current = current->children[ch];
35+
}
36+
return current->is_terminal;
37+
}
38+
39+
bool Trie::starts_with(const std::string& prefix) {
40+
TrieNode* current = root;
41+
for (char ch : prefix) {
42+
if (current->children.find(ch) == current->children.end()) {
43+
return false;
44+
}
45+
current = current->children[ch];
46+
}
47+
return true;
48+
}
49+
50+
std::vector<std::string> Trie::strings_with_prefix(const std::string& prefix) {
51+
std::vector<std::string> result;
52+
TrieNode* current = root;
53+
for (char ch : prefix) {
54+
if (current->children.find(ch) == current->children.end()) {
55+
return result;
56+
}
57+
current = current->children[ch];
58+
}
59+
// Perform DFS to collect all strings with the given prefix
60+
std::vector<std::pair<TrieNode*, std::string>> stack;
61+
stack.push_back({current, prefix});
62+
while (!stack.empty()) {
63+
auto [node, str] = stack.back();
64+
stack.pop_back();
65+
if (node->is_terminal) {
66+
result.push_back(str);
67+
}
68+
for (auto& pair : node->children) {
69+
stack.push_back({pair.second, str + pair.first});
70+
}
71+
}
72+
return result;
73+
}
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
#ifndef STRINGS_TRIE_HPP
2+
#define STRINGS_TRIE_HPP
3+
4+
#include <Python.h>
5+
#include <unordered_map>
6+
#include <vector>
7+
#include <string>
8+
9+
// One node of the trie.
//
// A node owns the nodes stored in `children`: its destructor deletes them.
// Copying is therefore disabled, because a copy would hold the same child
// pointers and both objects would delete them.
class TrieNode {
public:
    char character;                                 // character on the edge leading to this node ('\0' by default)
    bool is_terminal;                               // true when a stored key ends at this node
    std::unordered_map<char, TrieNode*> children;   // owned child nodes, keyed by next character

    TrieNode(char ch = '\0') : character(ch), is_terminal(false) {}
    ~TrieNode();

    // Non-copyable: see the ownership note above.
    TrieNode(const TrieNode&) = delete;
    TrieNode& operator=(const TrieNode&) = delete;
};
18+
19+
class Trie {
20+
public:
21+
TrieNode* root;
22+
23+
Trie();
24+
~Trie();
25+
26+
void insert(const std::string& word);
27+
bool search(const std::string& word);
28+
bool starts_with(const std::string& prefix);
29+
std::vector<std::string> strings_with_prefix(const std::string& prefix);
30+
};
31+
32+
#endif

pydatastructs/strings/_extension.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
from setuptools import Extension
2+
3+
project = 'pydatastructs'

module = 'strings'

backend = '_backend'

cpp = 'cpp'

# Paths shared by more than one extension below.
_algorithms_cpp = '/'.join([project, module, backend, cpp, 'algorithms', 'algorithms.cpp'])
_utils_string_cpp = '/'.join([project, "utils", "_backend", "cpp", "string.cpp"])

# Define the extension for string algorithms
# NOTE(review): algorithms.cpp defines no PyInit__algorithms function, so this
# extension is presumably not importable on its own -- confirm it is needed.
algorithms = '.'.join([project, module, backend, cpp, '_algorithms'])
algorithms_sources = [
    _algorithms_cpp,
    _utils_string_cpp
]

# Define the extension for the Trie data structure
# NOTE(review): trie.cpp defines no PyInit__trie function either -- confirm.
trie = '.'.join([project, module, backend, cpp, '_trie'])
trie_sources = [
    '/'.join([project, module, backend, cpp, 'trie', 'trie.cpp']),
    _utils_string_cpp
]

# Define the extension for the main strings module
strings = '.'.join([project, module, backend, cpp, '_strings'])
strings_sources = [
    '/'.join([project, module, backend, cpp, 'strings.cpp']),
    # strings.cpp calls kmp_search, rabin_karp_search, boyer_moore_search and
    # z_function_search, which are defined in algorithms.cpp; without it the
    # extension is left with unresolved symbols.
    _algorithms_cpp,
    # strings.cpp also calls pyobj_to_string / vector_to_pylist from the utils
    # string helpers.
    # NOTE(review): assumed to be defined in string.cpp -- confirm.
    _utils_string_cpp
]

extensions = [
    Extension(algorithms, sources=algorithms_sources),
    Extension(trie, sources=trie_sources),
    Extension(strings, sources=strings_sources)
]

0 commit comments

Comments
 (0)