Skip to content
Open

Dev #42

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions dt/include/da2.h
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,8 @@ namespace npbnlp {
}

long _check_acceptable(long c, long b, long m) {
if (m < 0)
return 0;
//long k = _base[b] + c + m;
long k = c + m;
if (!_is_empty(k))
Expand Down Expand Up @@ -422,15 +424,17 @@ namespace npbnlp {
long m = 0;
while (1) {
m = h - sibling[0];
auto c = h;
//auto c = h;
h = -_check[h];
/*
if (m < 0) {
if (c == _tail) {
_extend();
h = _head;
}
continue;
}
*/
bool accept = true;
for (auto& s : sibling) {
if (!_check_acceptable(s, b, m)) {
Expand Down Expand Up @@ -489,15 +493,17 @@ namespace npbnlp {
long m = 0;
while (1) {
m = h - _c[subtree.sibling[0].id];
auto c = h;
//auto c = h;
h = -_check[h];
/*
if (m < 0) {
if (c == _tail) {
_extend();
h = _head;
}
continue;
}
*/
bool accept = true;
for (auto& s : subtree.sibling) {
if (!_check_acceptable(s, b, m)) {
Expand Down
20 changes: 20 additions & 0 deletions io/include/chunk.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ namespace npbnlp {
int head;
int len;
int id;
int type;
std::vector<int> n;
friend std::ostream& operator<<(std::ostream& os, const chunk& c) {
for (auto i = 0; i < c.len; ++i) {
Expand All @@ -38,6 +39,7 @@ namespace npbnlp {
if (i < c.len-1)
os << " ";
}
os << ":" << c.type;
if (c.id > 0)
os << ":" << c.id;
if (c.k > 0)
Expand All @@ -49,19 +51,35 @@ namespace npbnlp {
private:
};
struct chash {
	/**
	 * Hash functor for chunk keys.
	 *
	 * The seed starts at the chunk length; every element c[0] .. c[len-1]
	 * is scrambled with a multiply/xor-shift mixer and folded into the seed
	 * with the 0x9e3779b9 shift-and-add combining step, so the result
	 * depends on both the element values and their order.
	 *
	 * The per-element mixing is done in unsigned arithmetic on purpose:
	 * the multiplications are expected to wrap around, which is well
	 * defined for unsigned types but undefined behaviour for signed ones,
	 * and right-shifting a negative signed value is implementation-defined.
	 *
	 * @param c chunk to hash
	 * @return hash value; chunks with equal length and equal elements
	 *         always produce the same value
	 */
	size_t operator() (const chunk& c) const {
		size_t seed = c.len;
		for (int i = 0; i < c.len; ++i) {
			// NOTE(review): assumes c[i] yields an integral value -- confirm
			// against chunk::operator[].
			unsigned int x = static_cast<unsigned int>(c[i]);
			x = ((x >> 16) ^ x) * 0x45d9f3bu;
			x = ((x >> 16) ^ x) * 0x45d9f3bu;
			x = (x >> 16) ^ x;
			seed ^= x + 0x9e3779b9 + (seed << 6) + (seed >> 2);
		}
		return seed;
	}
};
struct ccmp {
	/**
	 * Equality predicate for chunks.
	 *
	 * Two chunks compare equal when they have the same length and
	 * a[i] == b[i] holds for every i in [0, len).
	 *
	 * The bound is tested before the elements are read, so no element at
	 * index len is ever accessed (the earlier form evaluated a[i] == b[i]
	 * before checking i < a.len and therefore read one element past the
	 * end whenever the chunks were equal).
	 *
	 * @param a left-hand chunk
	 * @param b right-hand chunk
	 * @return true if both chunks have equal length and equal elements
	 */
	bool operator() (const chunk& a, const chunk& b) const {
		if (a.len != b.len)
			return false;
		for (int i = 0; i < a.len; ++i) {
			if (!(a[i] == b[i]))
				return false;
		}
		return true;
	}
};
Expand All @@ -81,6 +99,8 @@ namespace npbnlp {
static std::mutex _mutex;
static std::shared_ptr<std::vector<word> > _word;
static std::shared_ptr<std::vector<unsigned int> > _letter;
static std::shared_ptr<std::vector<word> > _load_word;
static std::shared_ptr<std::vector<unsigned int> > _load_letter;
int _id;
cdic _index;
std::vector<int> _misn;
Expand Down
15 changes: 15 additions & 0 deletions io/include/cio.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#ifndef NPBNLP_CHUNK_IO
#define NPBNLP_CHUNK_IO

#include"io.h"
namespace npbnlp {
// Holder for a collection of io objects, exposed through the public
// `chunk` member.
// NOTE(review): presumably the constructors populate `chunk` from the given
// input; the implementation is not visible here -- confirm.
class cio {
public:
// Construct from a C string.
// NOTE(review): `f` is presumably a file path -- confirm against the
// implementation.
cio(const char *f);
// Construct from an already opened input stream.
cio(std::istream& in);
// Virtual destructor, so deleting through a cio pointer also runs the
// destructor of a derived class.
virtual ~cio();
// Shared vector of io objects held by this instance.
std::shared_ptr<std::vector<io> > chunk;
};
}

#endif
2 changes: 2 additions & 0 deletions io/include/word.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ namespace npbnlp {
return false;
int i = 0;
for (; a[i] == b[i] && i < a.len; ++i);
//return true;
return (i == a.len);
}
};
Expand All @@ -101,6 +102,7 @@ namespace npbnlp {
static std::shared_ptr<wid> _idx;
static std::mutex _mutex;
static std::shared_ptr<std::vector<unsigned int> > _letter;
static std::shared_ptr<std::vector<unsigned int> > _load_letter;
int _id;
wdic _index;
std::vector<int> _misn;
Expand Down
116 changes: 95 additions & 21 deletions io/src/chunk.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,17 @@ shared_ptr<cid> cid::_idx;
mutex cid::_mutex;
shared_ptr<vector<unsigned int> > cid::_letter;
shared_ptr<vector<word> > cid::_word;
shared_ptr<vector<unsigned int> > cid::_load_letter;
shared_ptr<vector<word> > cid::_load_word;

shared_ptr<cid> cid::create() {
	// Hold the class-wide mutex for the whole call so the first-time
	// set-up below runs at most once.
	lock_guard<mutex> guard(_mutex);

	// Already set up: hand back the existing instance.
	if (_idx != nullptr)
		return _idx;

	// First call: build the shared index and the static storage vectors
	// (working buffers and the separate buffers used when loading).
	_idx.reset(new cid(4/*2*/));
	_word = make_shared<vector<word> >();
	_letter = make_shared<vector<unsigned int> >();
	_load_word = make_shared<vector<word> >();
	_load_letter = make_shared<vector<unsigned int> >();
	return _idx;
}
Expand All @@ -26,8 +30,8 @@ int cid::operator[](chunk& c) {
}

int cid::index(chunk& c) {
lock_guard<mutex> m(_mutex);
if (_index.find(c) == _index.end()) {
lock_guard<mutex> m(_mutex);
if (!_misn.empty()) {
_index[c] = _misn[_misn.size()-1];
_misn.pop_back();
Expand All @@ -39,9 +43,9 @@ int cid::index(chunk& c) {
}

void cid::remove(chunk& c) {
lock_guard<mutex> m(_mutex);
auto it = _index.find(c);
if (it != _index.end()) {
lock_guard<mutex> m(_mutex);
_misn.push_back(it->second);
_index.erase(c);
}
Expand Down Expand Up @@ -79,28 +83,39 @@ bool cid::load(const char *file) {
int rawsize = 0;
if (fread(&rawsize, sizeof(int), 1, fp) != 1)
throw "failed to read _letter->size() in cid::load";
_letter->resize(rawsize);
if (fread(&(*_letter)[0], sizeof(unsigned int), rawsize, fp) != (size_t)rawsize)
_load_letter->resize(rawsize);
if (fread(&(*_load_letter)[0], sizeof(unsigned int), rawsize, fp) != (size_t)rawsize)
throw "failed to read _letter in cid::load";
int wsize = 0;
if (fread(&wsize, sizeof(int), 1, fp) != 1)
throw "failed to read size of words in cid::load";
for (int i = 0; i < wsize; ++i) {
word w;
w.load(fp, *_letter);
_word->push_back(w);
w.load(fp, *_load_letter);
_load_word->push_back(w);
}
int csize = 0;
if (fread(&csize, sizeof(int), 1, fp) != 1)
throw "failed to read size of chunk indices in cid::load";
for (int i = 0; i < csize; ++i) {
chunk c;
c.load(fp, *_word);
c.load(fp, *_load_word);
int id = 0;
if (fread(&id, sizeof(int), 1, fp) != 1)
throw "failed to read chunk id in cid::load";
c.id = id;
_index[c] = id;
/*
cout << "load chunk:" << c << " len:" << c.len << " id:" << id;
for (auto j = 0; j < c.len; ++j) {
word& w = c.wd(j);
cout << " " << w << " letter:";
for (auto l = 0; l < w.len; ++l) {
cout << w[l] << " ";
}
}
cout << endl;
*/
}
fclose(fp);
return true;
Expand All @@ -109,20 +124,63 @@ bool cid::load(const char *file) {
void cid::_store(FILE *fp) {
vector<chunk> d;
vector<int> id;
int index = 0;
for (auto it = _index.begin(); it != _index.end(); ++it) {
//cout << "chunk:" << it->first << " id:" << it->second << " len:" << it->first.len;
int chead = _word->size();
//c.id = it->second;
for (auto i = 0; i < it->first.len; ++i) {
word& w = it->first.wd(i);
//cout << " word:" << w;
int head = _letter->size();
for (auto j = 0; j < w.len; ++j) {
//cout << " letter_" << j << ":" << w[j];
_letter->emplace_back(w[j]);
}
word x(*_letter, head, w.len);
x.id = w.id;
x.pos = w.pos;
x.n = w.n;
for (auto j = 0; j < (int)w.m.size(); ++j)
x.m[j] = w.m[j];
/*
for (auto m : w.m) {
x.m.emplace_back(m);
}
*/
_word->emplace_back(x);
}
chunk c(*_word, chead, it->first.len);
c.k = it->first.k;
c.id = it->second;
c.type = it->first.type;
for (auto j = 0; j < (int)it->first.n.size(); ++j)
c.n[j] = it->first.n[j];
//cout << " to store chunk:" << c << " chunk_num:" << index++ << "/" << _index.size() << endl;
d.emplace_back(c);
id.emplace_back(it->second);
//d.emplace_back(chunk(*_word, chead, it->first.len));
/*
chunk c(it->first);
id.push_back(it->second);
c.head = _word->size();
int chead = _word->size();
//c.head = _word->size();
c.id = it->second;
for (auto i = 0; i < c.len; ++i) {
word& w = c.wd(i);
w.head = _letter->size();
int whead = _letter->size();
for (auto j = 0; j < w.len; ++j) {
_letter->push_back(w[j]);
}
w.head = whead;
_word->push_back(w);
}
d.push_back(c);
c.head = chead;
*/
//cout << " to store chunk:" << c << endl;
//d.push_back(c);
}
//cout << "save to dic" << endl;
int lsize = _letter->size();
if (fwrite(&lsize, sizeof(int), 1, fp) != 1)
throw "failed to write size of raw in cid::_store";
Expand All @@ -137,7 +195,15 @@ void cid::_store(FILE *fp) {
int csize = d.size();
if (fwrite(&csize, sizeof(int), 1, fp) != 1)
throw "failed to write size of chunks in cid::_store";
//cout << "save chunks to file" << endl;
for (auto i = 0; i < (int)d.size(); ++i) {
/*
cout << "save_chunk:" << d[i] << " id:" << id[i] << " token:";
for (auto j = 0; j < d[i].len; ++j) {
cout << " " << d[i].wd(j);
}
cout << endl;
*/
d[i].save(fp);
if (fwrite(&id[i], sizeof(int), 1, fp) != 1)
throw "failed to write chunk id in cid::_store";
Expand All @@ -147,38 +213,40 @@ void cid::_store(FILE *fp) {
// Construct an index object whose _id is set to `id`; no other member is
// named in the initializer list, so the rest are default-initialized.
cid::cid(int id):_id(id) {
}

chunk::chunk():k(0), head(0), len(1), id(0), n(len+1,0), _doc(NULL) {
chunk::chunk():k(0), head(0), len(1), id(0), type(8), n(len+1,0), _doc(NULL) {
}

chunk::chunk(vector<word>& d): k(0), head(0), len(0), id(0), n(len+1,0), _doc(&d) {
chunk::chunk(vector<word>& d): k(0), head(0), len(0), id(0), type(8), n(len+1,0), _doc(&d) {
}

chunk::chunk(vector<word>& d, int head, int len): k(0), head(head), len(len), id(1), n(len+1,0), _doc(&d) {
chunk::chunk(vector<word>& d, int head, int len): k(0), head(head), len(len), id(1), type(8), n(len+1,0), _doc(&d) {
}

chunk::chunk(const chunk& c): k(c.k), head(c.head), len(c.len), id(c.id), _doc(c._doc) {
chunk::chunk(const chunk& c): k(c.k), head(c.head), len(c.len), id(c.id), type(c.type), _doc(c._doc) {
for (auto i = c.n.begin(); i < c.n.end(); ++i)
n.push_back(*i);
}

chunk::chunk(chunk&& c): k(c.k), head(c.head), len(c.len), id(c.id), _doc(c._doc) {
chunk::chunk(chunk&& c): k(c.k), head(c.head), len(c.len), id(c.id), type(c.type), _doc(c._doc) {
n = move(c.n);
c.k = 0;
c.head = 0;
c.len = 1;
c.id = 0;
c.type = 8;
c._doc = nullptr;
/*
for (auto i = c.n.begin(); i < c.n.end(); ++i)
n.push_back(*i);
*/
for (auto i = c.n.begin(); i < c.n.end(); ++i)
n.push_back(*i);
*/
}

chunk& chunk::operator=(const chunk& c) {
k = c.k;
head = c.head;
len = c.len;
id = c.id;
type = c.type;
_doc = c._doc;
for (auto i = c.n.begin(); i < c.n.end(); ++i)
n.push_back(*i);
Expand All @@ -192,17 +260,19 @@ chunk& chunk::operator=(chunk&& c) noexcept {
head = c.head;
len = c.len;
id = c.id;
type = c.type;
_doc = c._doc;
n = move(c.n);
c.k = 0;
c.head = 0;
c.len = 1;
c.id = 0;
c.type = 8;
c._doc = nullptr;
/*
for (auto i = c.n.begin(); i < c.n.end(); ++i)
n.push_back(*i);
*/
for (auto i = c.n.begin(); i < c.n.end(); ++i)
n.push_back(*i);
*/
return *this;
}

Expand Down Expand Up @@ -232,6 +302,8 @@ void chunk::save(FILE *fp) {
throw "failed to write chunk.len";
if (fwrite(&id, sizeof(int), 1, fp) != 1)
throw "failed to write chunk.id";
if (fwrite(&type, sizeof(int), 1, fp) != 1)
throw "failed to write chunk.type";
}

void chunk::load(FILE *fp, vector<word>& w) {
Expand All @@ -245,5 +317,7 @@ void chunk::load(FILE *fp, vector<word>& w) {
throw "failed to read chunk.len";
if (fread(&id, sizeof(int), 1, fp) != 1)
throw "failed to read chunk.id";
if (fread(&type, sizeof(int), 1, fp) != 1)
throw "failed to read chunk.type";
_doc = &w;
}
Loading