Skip to content

Commit 65108f2

Browse files
author
johnholt
committed
Starting Point
Signed-off-by: johnholt <john.d.holt@lexisnexis.com>
0 parents  commit 65108f2

41 files changed

Lines changed: 5174 additions & 0 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

Bundle.ecl

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
IMPORT Std;
2+
// Bundle descriptor for TextSearch. Overrides the descriptive fields
// inherited from Std.BundleBase (name, description, authors, license,
// copyright, dependencies and version).
EXPORT Bundle := MODULE(Std.BundleBase)
3+
EXPORT Name := 'TextSearch';
4+
EXPORT Description := 'Text search Framework';
5+
EXPORT Authors := ['HPCCSystems'];
6+
// Points at the license file added with this commit; the file is named
// LICENSE.txt, so the reference uses the same case.
EXPORT License := 'See LICENSE.txt';
7+
EXPORT Copyright := 'Copyright (C) 2017 HPCC Systems';
8+
// No other bundles are required.
EXPORT DependsOn := [];
9+
EXPORT Version := '1.0.0';
10+
END;

LICENSE.txt

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
Copyright (C) 2017 HPCC Systems
2+
3+
Permission is hereby granted, free of charge, to any person obtaining a copy of
4+
this software and associated documentation files (the "Software"), to deal in
5+
the Software without restriction, including without limitation the rights to
6+
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
7+
of the Software, and to permit persons to whom the Software is furnished to do
8+
so, subject to the following conditions:
9+
10+
The above copyright notice and this permission notice shall be included in all
11+
copies or substantial portions of the Software.
12+
13+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19+
SOFTWARE.

README

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
TextSearch
2+
==========
3+
4+
TextSearch ECL code
5+
Initial version of attributes.
6+
7+
Attributes structured as:
8+
Common: Layouts, types, and other common definitions.
9+
Inverted: Attributes that define the inversion and related.
10+
Regression: Attributes that define regression tests.
11+
Resolved: Attributes that define the answer set.

TextSearch/Common/Constants.ecl

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
// Shared constants for TextSearch: processing limits, fixed nominal values,
// and message text paired with numeric message codes.
EXPORT Constants := MODULE
2+
// Limit Constants
3+
EXPORT Max_SearchTerms := 1000;
4+
// Derived limit: always twice Max_SearchTerms.
EXPORT Max_Ops := 2 * Max_SearchTerms;
5+
EXPORT Max_Hits := 1000000; // Max hits to use in processing
6+
EXPORT Max_Merge_Input := 100; // Max active merge inputs
7+
EXPORT Max_Wild := 10000;
8+
EXPORT Max_DocHits := 1000; // Max hits per document to keep
9+
EXPORT Max_Depth := 100;
10+
EXPORT Max_Prop_Name := 50;
11+
EXPORT Max_Prop_Value := 300;
12+
EXPORT Max_Path_Nominals :=100;
13+
EXPORT Max_Docs_Complex := 2000000;
14+
EXPORT Max_Rqst_Length := 8192;
15+
EXPORT Max_Token_Length:= 512;
16+
EXPORT Max_Msg_Length := 75;
17+
EXPORT Max_Types := 5;
18+
EXPORT Max_Node_Depth := 50;
19+
EXPORT Key_Levels := 5; // numbered 0 to 4, need to change by level attrs
20+
21+
// Nominal Constants
// Five consecutive values, 1024 through 1028.
// NOTE(review): presumably reserved nominals for structural entries; confirm
// against the code that assigns term nominals.
22+
EXPORT Nominal_SeqKey := 1024;
23+
EXPORT Nominal_DocEntry := 1025;
24+
EXPORT Nominal_DocBegin := 1026;
25+
EXPORT Nominal_DocEnd := 1027;
26+
EXPORT Nominal_Noone := 1028;
27+
28+
// Message Constants
// Each message has a *_Msg text and a *_Code number. Every code is Base + n,
// with n running from 1 to 21. Messages up to BadParse_Msg are STRING
// literals; those from Literal_Msg onward are UNICODE (U'...') literals.
29+
SHARED Base := 1000; // may need to change this
30+
EXPORT OtherCharsInText_Msg := 'Unknown characters found in text';
31+
EXPORT OtherCharsInText_Code:= Base + 1;
32+
EXPORT MaxSrchTerms_Msg := 'Maximum number of search terms exceeded';
33+
EXPORT MaxSrchTerms_Code:= Base + 2;
34+
EXPORT MaxMerge_Msg := 'Internal error, max merge inputs exceeded';
35+
EXPORT MaxMerge_Code := Base + 3;
36+
EXPORT DepthDiff_Msg := 'Depth increased by more than 1';
37+
EXPORT DepthDiff_Code:= Base + 4;
38+
EXPORT BadDepth_Msg := 'First depth for doc not 1';
39+
EXPORT BadDepth_Code:= Base + 5;
40+
// Ends with ': ' so that detail text can be appended by the caller.
EXPORT BadParse_Msg := 'Parse error: ';
41+
EXPORT BadParse_Code:= Base + 6;
42+
EXPORT Literal_Msg := U'Searched as literal character';
43+
EXPORT Literal_Code := Base + 7;
44+
EXPORT Word_Msg := U'Taken to be a search term';
45+
EXPORT Word_Code := Base + 8;
46+
EXPORT XtraLG_Msg := U'Missing Right Grouping Operator';
47+
EXPORT XtraLG_Code := Base + 9;
48+
EXPORT XtraRG_Msg := U'Extra Right grouping Operator';
49+
EXPORT XtraRG_Code := Base + 10;
50+
EXPORT IllConn_Msg := U'Connector illegal here';
51+
EXPORT IllConn_Code := Base + 11;
52+
EXPORT NoConn_Msg := U'Missing connector';
53+
EXPORT NoConn_Code := Base + 12;
54+
EXPORT IllThis_Msg := U'This connector illegal here';
55+
EXPORT IllThis_Code := Base + 13;
56+
EXPORT ExtraEP_Msg := U'Missing Left SQB';
57+
EXPORT ExtraEP_Code := Base + 14;
58+
EXPORT MissedQT_Msg := U'Missing end quote';
59+
EXPORT MissedQT_Code:= Base + 15;
60+
EXPORT MissedEP_Msg := U'Missing Right SQB';
61+
EXPORT MissedEP_Code:= Base + 16;
62+
EXPORT Ill_Pred_Msg := U'Predicate illegal here';
63+
EXPORT Ill_Pred_Code:= Base + 17;
64+
EXPORT AnyTag_Msg := U'Any Element/Attribute not supported';
65+
EXPORT AnyTag_Code := Base + 18;
66+
EXPORT Syntax_Msg := U'Syntax error, pending ops';
67+
EXPORT Syntax_Code := Base + 19;
68+
EXPORT Ill_Flt_Msg := U'Filter Illegal here';
69+
EXPORT Ill_Flt_Code := Base + 20;
70+
EXPORT No_Prfx_Msg := U'No prefix provided for File Information';
71+
EXPORT No_Prfx_Code := Base + 21;
72+
END;
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
//Default implementation. Provides minimal functionality.
2+
IMPORT Std.Uni;
3+
IMPORT TextSearch.Common;
4+
IMPORT TextSearch.Common.Types;
5+
IMPORT TextSearch.Common.Layouts;
6+
// Local aliases for the types and layouts used below.
TermString := Types.TermString;
7+
EquivTerm := Layouts.EquivTerm;
8+
Version := Types.Version;
9+
// Empty EquivTerm dataset, returned when there are no equivalents.
NoEquiv := DATASET([],EquivTerm);
10+
ToUpper := Uni.ToUpperCase;
11+
12+
// Minimal implementation of Common.IKeywording:
//   currentVersion - fixed at 0.
//   hasEquivalence - always FALSE.
//   SingleKeyword  - the term converted to upper case (Std.Uni.ToUpperCase).
//   EquivKeywords  - always the empty dataset.
// The version argument v defaults to 0 and is not used by any member.
EXPORT Default_Keywording := MODULE(Common.IKeywording)
13+
EXPORT Version currentVersion := 0;
14+
EXPORT BOOLEAN hasEquivalence(TermString trm, Version v=0) := FALSE;
15+
EXPORT TermString SingleKeyword(TermString trm, Version v=0) := ToUpper(trm);
16+
EXPORT DATASET(EquivTerm) EquivKeywords(TermString trm, Version v=0) := noEquiv;
17+
END;
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
// Interface describing the values from which TextSearch file names are built.
// Prefix and Instance have no default and must be supplied by the
// implementing module; the remaining members carry defaults.
EXPORT FileName_Info := INTERFACE
2+
EXPORT STRING Prefix;
3+
EXPORT STRING Instance; // the version for an individual instance or the Alias
4+
// Default alias name.
EXPORT STRING AliasInstance := 'CURRENT';
5+
EXPORT UNSIGNED2 Naming := 1;
6+
EXPORT UNSIGNED2 DataVersion := 0;
7+
// NOTE(review): same value as Constants.Key_Levels (5); confirm whether the
// two are meant to stay in step.
EXPORT UNSIGNED1 Levels := 5;
8+
END;
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
//Instance of the FileName_Info block. Used to unify the names used by TextSearch.
2+
IMPORT TextSearch.Common;
3+
IMPORT STD.Str;
4+
Info := Common.FileName_Info;
5+
// Builds a FileName_Info module from a prefix and an instance name.
//   aPre  - file name prefix; must not be empty after normalisation.
//   aInst - instance name; when empty, AliasInstance is used instead.
// Both arguments are upper-cased and have all spaces removed (TRIM ... ALL).
// Evaluating Prefix with an empty prefix raises FAIL with
// Constants.No_Prfx_Code and the No_Prfx_Msg text.
EXPORT FileName_Info_Instance(STRING aPre, STRING aInst) := MODULE(Info)
6+
STRING wPrefix := TRIM(Str.ToUpperCase(aPre),ALL);
7+
EXPORT STRING Prefix := IF(wPrefix<>'',
8+
wPrefix,
9+
FAIL(STRING,
10+
Common.Constants.No_Prfx_code,
11+
// No_Prfx_Msg is a UNICODE literal, so it is cast to STRING for FAIL.
(STRING)Common.Constants.No_Prfx_Msg));
12+
STRING wInst := TRIM(Str.ToUpperCase(aInst),ALL);
13+
// AliasInstance is inherited from the interface.
EXPORT STRING Instance := IF(wInst<>'', wInst, AliasInstance);
14+
END;

TextSearch/Common/FileNames.ecl

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
IMPORT TextSearch.Common;
2+
//Creates file names. The names are both the names of the individual
3+
//logical files and the container names used as aliases for a group
4+
//of file instances.
5+
//
6+
//The form of the file name is:
7+
//<Prefix>::DocSearch::Level-<xx>::<Instance>::<Suffix>
8+
//where: Prefix is FileName_Info.Prefix; xx is the level number, zero-padded to two digits (00 to 04);
9+
// Instance is FileName_Info.Instance; and Suffix is the data type as below.
10+
FileName_Info := Common.FileName_Info;
11+
12+
// Logical file names for each TextSearch index.
//   info - supplies Prefix and Instance for the name.
// Every exported function takes a level number (default 0) and returns
//   <Prefix>::DocSearch::Level-<NN>::<Instance>::<Suffix>
// where NN is the level formatted as two zero-padded digits.
EXPORT FileNames(FileName_Info info) := MODULE
13+
SHARED DocSearchPrefix := '::DocSearch::Level-';
14+
// Common name builder; INTFORMAT(lvl, 2, 1) gives a width of 2 with zero fill.
SHARED Name(STRING suffix, UNSIGNED lvl) := info.Prefix + DocSearchPrefix
15+
+ INTFORMAT(lvl, 2, 1) + '::'
16+
+ info.Instance + '::' + suffix;
17+
18+
EXPORT DocumentIndex(UNSIGNED lvl=0) := Name('DocIndx', lvl);
19+
EXPORT TriGramDictionary(UNSIGNED lvl=0) := Name('TriDctIndx', lvl);
20+
EXPORT TermDictionary(UNSIGNED lvl=0) := Name('DictIndx', lvl);
21+
EXPORT TriGramIndex(UNSIGNED lvl=0) := Name('TriGramIndx', lvl);
22+
EXPORT TermIndex(UNSIGNED lvl=0) := Name('TermIndx', lvl);
23+
EXPORT PhraseIndex(UNSIGNED lvl=0) := Name('PhraseIndx', lvl);
24+
EXPORT ElementIndex(UNSIGNED lvl=0) := Name('ElemIndx', lvl);
25+
EXPORT AttributeIndex(UNSIGNED lvl=0) := Name('AttrIndx', lvl);
26+
EXPORT RangeIndex(UNSIGNED lvl=0) := Name('RngIndx', lvl);
27+
EXPORT NameSpaceDict(UNSIGNED lvl=0) := Name('SpaceIndx', lvl);
28+
EXPORT TagDictionary(UNSIGNED lvl=0) := Name('TagIndx', lvl);
29+
// Level parameter uses UNSIGNED, matching the Name helper and the functions above.
EXPORT IdentIndx(UNSIGNED lvl=0) := Name('IdentIndx', lvl);
30+
EXPORT DeleteIndex(UNSIGNED lvl=0) := Name('DelIndx', lvl);
31+
END;

TextSearch/Common/IKeywording.ecl

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
//Interface for keywording routine. Provides normal form or forms.
2+
//
3+
IMPORT TextSearch.Common.Types;
4+
IMPORT TextSearch.Common.Layouts;
5+
// Local aliases for the types and layouts used by the interface.
TermString := Types.TermString;
6+
EquivTerm := Layouts.EquivTerm;
7+
Version := Types.Version;
8+
9+
// Contract for a keywording implementation.
//   currentVersion - version value supplied by the implementation; it is the
//                    default for the v argument of every function below.
//   hasEquivalence - BOOLEAN result for a term.
//   SingleKeyword  - a single TermString for a term.
//   EquivKeywords  - a dataset of EquivTerm rows for a term.
// NOTE(review): the meaning of "equivalence" and of the version argument is
// not defined here; confirm against the implementations.
EXPORT IKeywording := INTERFACE
10+
EXPORT Version currentVersion;
11+
EXPORT BOOLEAN hasEquivalence(TermString trm, Version v=currentVersion);
12+
EXPORT TermString SingleKeyword(TermString trm, Version v=currentVersion);
13+
EXPORT DATASET(EquivTerm) EquivKeywords(TermString trm, Version v=currentVersion);
14+
END;

TextSearch/Common/Keys.ecl

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
IMPORT TextSearch.Common;
2+
3+
// Aliases
4+
FileName_Info := Common.FileName_Info;
5+
FileNames := Common.FileNames;
6+
Types := Common.Types;
7+
TermDictionaryEntry := Common.Layouts.TermDictionaryEntry;
8+
TagDictionaryEntry := Common.Layouts.TagDictionaryEntry;
9+
TermPosting := Common.Layouts.TermPosting;
10+
TagPosting := Common.Layouts.TagPosting;
11+
PhrasePosting := Common.Layouts.PhrasePosting;
12+
DocIndex := Common.Layouts.DocIndex;
13+
DeletedDoc := Common.Layouts.DeletedDoc;
14+
// Default streams
// Empty datasets used as the default argument of each index definition.
// NOTE(review): presumably this lets a reader reference a key without
// supplying build data; confirm against callers.
15+
emptyDict := DATASET([], TermDictionaryEntry);
16+
emptyTagD := DATASET([], TagDictionaryEntry);
17+
emptyTerm := DATASET([], TermPosting);
18+
emptyTagP := DATASET([], TagPosting);
19+
emptyPhrs := DATASET([], PhrasePosting);
20+
emptyDocs := DATASET([], DocIndex);
21+
emptyDelx := DATASET([], DeletedDoc);
22+
23+
// Index definitions for the TextSearch keys.
//   info - supplies the Prefix and Instance used in each index file name.
//   lvl  - level number passed to every FileNames function (default 0).
// Each member has the form INDEX(d, {keyed fields}, {payload fields}, name, SORTED).
// NOTE(review): only DocumentIndex, DeleteIndex and IdentIndex add OPT, so
// that a missing file reads as empty; confirm the other keys omit it on purpose.
EXPORT Keys(FileName_Info info, UNSIGNED1 lvl=0) := MODULE
24+
// Term dictionary
// Keyed on the first 20 characters of kw; the full kw is in the payload.
25+
EXPORT TermDictionary(DATASET(TermDictionaryEntry) d=emptyDict)
26+
:= INDEX(d, {typTerm, UNICODE20 kw20:=kw[1..20], termNominal},
27+
{termFreq, docFreq, kw, term},
28+
FileNames(info).TermDictionary(lvl), SORTED);
29+
30+
// Tag Dictionary
// Keyed on the first 20 characters of tagName; the full name is in the payload.
31+
EXPORT TagDictionary(DATASET(TagDictionaryEntry) d=emptyTagD)
32+
:= INDEX(d, {UNICODE20 tag20:=tagName[1..20], typData, tagNominal,
33+
pathLen},
34+
{pathNominal, tagName, pathString},
35+
FileNames(info).TagDictionary(lvl), SORTED);
36+
// Term Inversion
37+
EXPORT TermIndex(DATASET(TermPosting) d=emptyTerm)
38+
:= INDEX(d, {typTerm, termNominal, id, kwpBegin, start, kwpEnd, stop,
39+
pathNominal, parentNominal, preorder, parentOrd},
40+
{depth, lp, typData, kw, term},
41+
FileNames(info).TermIndex(lvl), SORTED);
42+
43+
// Element Inversion
// Only postings whose typData is in Types.ElementDTypes are indexed.
44+
EXPORT ElementIndex(DATASET(TagPosting) d=emptyTagP)
45+
:= INDEX(d(typData IN Types.ElementDTypes),
46+
{tagNominal, id, kwpBegin, start, kwpEnd, stop, pathNominal,
47+
parentNominal, parentOrd, depth, preorder, typData},
48+
{lenText, kwsText, lastOrd, tagName},
49+
FileNames(info).ElementIndex(lvl), SORTED);
50+
51+
// Phrase Index keys
52+
EXPORT PhraseIndex(DATASET(PhrasePosting) d=emptyPhrs)
53+
:= INDEX(d, {nominal1, nominal2, id, kwpBegin, start, kwpEnd, stop,
54+
pathNominal, parentNominal, preorder, parentOrd},
55+
{kw1, lp1, term1, kw2, lp2, term2},
56+
FileNames(info).PhraseIndex(lvl), SORTED);
57+
58+
// Attribute index
// Only postings whose typData is in Types.AttribDTypes are indexed. The first
// 10 characters of tagValue (v10) are the second key field.
59+
EXPORT AttributeIndex(DATASET(TagPosting) d=emptyTagP)
60+
:= INDEX(d(typData IN Types.AttribDTypes),
61+
{tagNominal, UNICODE10 v10:=tagValue[1..10], parentNominal, id,
62+
kwpBegin, start, kwpEnd, stop, pathNominal, preorder, parentOrd},
63+
{typData, tagName, tagValue, pathString},
64+
FileNames(info).AttributeIndex(lvl), SORTED);
65+
66+
// Attribute Range Index
// Same filter and fields as AttributeIndex, but v10 is the last key field.
67+
EXPORT RangeIndex(DATASET(TagPosting) d=emptyTagP)
68+
:= INDEX(d(typData IN Types.AttribDTypes),
69+
{tagNominal, parentNominal, id, kwpBegin, start, kwpEnd, stop,
70+
pathNominal, preorder, parentOrd, UNICODE10 v10:=tagValue[1..10]},
71+
{typData, tagName, tagValue, pathString},
72+
FileNames(info).RangeIndex(lvl), SORTED);
73+
74+
// Document Index
75+
EXPORT DocumentIndex(DATASET(DocIndex) d=emptyDocs)
76+
:= INDEX(d, {id, keywords, docLength, seqKey}, {identifier, slugLine, wunit},
77+
FileNames(info).DocumentIndex(lvl), SORTED, OPT);
78+
79+
// Deleted document index
80+
EXPORT DeleteIndex(DATASET(DeletedDoc) d=emptyDelx)
81+
:= INDEX(d, {id}, {identifier}, FileNames(info).DeleteIndex(lvl), SORTED, OPT);
82+
83+
// Document Ident index
// Keyed on HASH32 of the identifier, then id; the full identifier is in the payload.
84+
EXPORT IdentIndex(DATASET(DocIndex) d=emptyDocs)
85+
:= INDEX(d, {Types.Nominal nominal:=HASH32(identifier), id},
86+
{identifier},
87+
FileNames(info).IdentIndx(lvl), SORTED, OPT);
88+
END;

0 commit comments

Comments
 (0)