forked from goldendict/goldendict
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdsl_details.hh
218 lines (172 loc) · 6.33 KB
/
dsl_details.hh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
/* This file is (c) 2008-2012 Konstantin Isakov <[email protected]>
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
#ifndef __DSL_DETAILS_HH_INCLUDED__
#define __DSL_DETAILS_HH_INCLUDED__
#include <string>
#include <list>
#include <vector>
#include <zlib.h>
#include "dictionary.hh"
#include "iconv.hh"
// Implementation details for Dsl, not part of its interface
namespace Dsl {
namespace Details {
using std::string;
using gd::wstring;
using gd::wchar;
using std::list;
using std::vector;
// Those are possible encodings for .dsl files
enum DslEncoding
{
Utf16LE,
Utf16BE,
Windows1252,
Windows1251,
Windows1250,
Utf8 // This is an extension. Detected solely by the UTF8 BOM.
};
struct DSLLangCode
{
int code_id;
char code[ 3 ]; // ISO 639-1
};
string findCodeForDslId( int id );
/// Parses the DSL language, representing it in its structural DOM form.
struct ArticleDom
{
struct Node: public list< Node >
{
bool isTag; // true if it is a tag with subnodes, false if it's a leaf text
// data.
// Those are only used if isTag is true
wstring tagName;
wstring tagAttrs;
wstring text; // This is only used if isTag is false
class Text {};
class Tag {};
Node( Tag, wstring const & name, wstring const & attrs ): isTag( true ),
tagName( name ), tagAttrs( attrs )
{}
Node( Text, wstring const & text_ ): isTag( false ), text( text_ )
{}
/// Concatenates all childen text nodes recursively to form all text
/// the node contains stripped of any markup.
wstring renderAsText( bool stripTrsTag = false ) const;
};
/// Does the parse at construction. Refer to the 'root' member variable
/// afterwards.
ArticleDom( wstring const &, string const & dictName = string(),
wstring const & headword_ = wstring() );
/// Root of DOM's tree
Node root;
private:
void openTag( wstring const & name, wstring const & attr, list< Node * > & stack );
void closeTag( wstring const & name, list< Node * > & stack,
bool warn = true );
wchar const * stringPos;
class eot {};
wchar ch;
bool escaped;
unsigned transcriptionCount; // >0 = inside a [t] tag
void nextChar() throw( eot );
/// Infomation for diagnostic purposes
string dictionaryName;
wstring headword;
};
/// A adapted version of Iconv which takes Dsl encoding and decodes to wchar.
class DslIconv: public Iconv
{
public:
DslIconv( DslEncoding ) throw( Iconv::Ex );
void reinit( DslEncoding ) throw( Iconv::Ex );
/// Returns a name to be passed to iconv for the given dsl encoding.
static char const * getEncodingNameFor( DslEncoding );
};
/// Opens the .dsl or .dsl.dz file and allows line-by-line reading. Auto-detects
/// the encoding, and reads all headers by itself.
class DslScanner
{
gzFile f;
DslEncoding encoding;
DslIconv iconv;
wstring dictionaryName;
wstring langFrom, langTo;
char readBuffer[ 65536 ];
char * readBufferPtr;
size_t readBufferLeft;
vector< wchar > wcharBuffer;
unsigned linesRead;
public:
DEF_EX( Ex, "Dsl scanner exception", Dictionary::Ex )
DEF_EX_STR( exCantOpen, "Can't open .dsl file", Ex )
DEF_EX( exCantReadDslFile, "Can't read .dsl file", Ex )
DEF_EX_STR( exMalformedDslFile, "The .dsl file is malformed:", Ex )
DEF_EX( exUnknownCodePage, "The .dsl file specified an unknown code page", Ex )
DEF_EX( exEncodingError, "Encoding error", Ex ) // Should never happen really
DslScanner( string const & fileName ) throw( Ex, Iconv::Ex );
~DslScanner() throw();
/// Returns the detected encoding of this file.
DslEncoding getEncoding() const
{ return encoding; }
/// Returns the dictionary's name, as was read from file's headers.
wstring const & getDictionaryName() const
{ return dictionaryName; }
/// Returns the dictionary's source language, as was read from file's headers.
wstring const & getLangFrom() const
{ return langFrom; }
/// Returns the dictionary's target language, as was read from file's headers.
wstring const & getLangTo() const
{ return langTo; }
/// Reads next line from the file. Returns true if reading succeeded --
/// the string gets stored in the one passed, along with its physical
/// file offset in the file (the uncompressed one if the file is compressed).
/// If end of file is reached, false is returned.
/// Reading begins from the first line after the headers (ones which start
/// with #).
bool readNextLine( wstring &, size_t & offset ) throw( Ex, Iconv::Ex );
/// Similar readNextLine but strip all DSL comments {{...}}
bool readNextLineWithoutComments( wstring &, size_t & offset ) throw( Ex, Iconv::Ex );
/// Returns the number of lines read so far from the file.
unsigned getLinesRead() const
{ return linesRead; }
/// Converts the given number of characters to the number of bytes they
/// would occupy in the file, knowing its encoding. It's possible to know
/// that because no multibyte encodings are supported in .dsls.
inline size_t distanceToBytes( size_t ) const;
};
/// This function either removes parts of string enclosed in braces, or leaves
/// them intact. The braces themselves are removed always, though.
void processUnsortedParts( wstring & str, bool strip );
/// Expands optional parts of a headword (ones marked with parentheses),
/// producing all possible combinations where they are present or absent.
void expandOptionalParts( wstring & str, list< wstring > * result,
size_t x = 0, bool inside_recurse = false );
/// Expands all unescaped tildes, inserting tildeReplacement text instead of
/// them.
void expandTildes( wstring & str, wstring const & tildeReplacement );
/// Unescapes any escaped chars. Be sure to handle all their special meanings
/// before unescaping them.
void unescapeDsl( wstring & str );
/// Normalizes the headword. Currently turns any sequences of consecutive spaces
/// into a single space.
void normalizeHeadword( wstring & );
/// Strip DSL {{...}} comments
void stripComments( wstring &, bool & );
inline size_t DslScanner::distanceToBytes( size_t x ) const
{
switch( encoding )
{
case Utf16LE:
case Utf16BE:
return x*2;
default:
return x;
}
}
/// Converts the given language name taken from Dsl header (i.e. getLangFrom(),
/// getLangTo()) to its proper language id.
quint32 dslLanguageToId( wstring const & name );
}
}
#endif