-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathhyphenation-fp.lisp
343 lines (310 loc) · 11.2 KB
/
hyphenation-fp.lisp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
;;;; -*- Mode: LISP; Syntax: ANSI-Common-Lisp; Base: 10 -*-
;;; $Id: hyphen.lisp,v 1.1.1.1 2003/04/30 09:16:04 fabrice.popineau Exp $
;;; New implementation of TeX algorithms in Common Lisp by Fabrice Popineau
;;; See file LICENSE.TXT for the license terms.
;;; email: [email protected]
;;; snail: Fabrice Popineau
;;; Supelec
;;; 2 rue E. Belin
;;; F-57070 Metz France
;;; www: http://www.metz.supelec.fr/~popineau/
(defpackage :cl-typesetting-hyphen
(:use common-lisp)
(:nicknames :cl-tt-hyph)
(:export #:language-defined-p
#:define-language
#:language-loaded-p
#:load-language
#:language-hyphenation))
(in-package :cl-typesetting-hyphen)
(defvar *cl-typesetting-base-directory*
(make-pathname :name nil :type nil :version nil
:defaults #.(or #-gcl *compile-file-truename* *load-truename*))
"The base directory for cl-typesetting source and auxiliary data")
(defvar *hyphen-patterns-directory*
(merge-pathnames (make-pathname :name nil :type nil :version nil
:directory '(:relative "hyphen-patterns"))
*cl-typesetting-base-directory*))
(defvar *language-hyphen-file-list*
'((:british . "gbhyph")
(:german-new . "dehyphn")
(:german-std . "dehypht")
(:american . "ushyph")
(:united-kingdom . "ukhyph")
(:french . "frhyph")
(:foo . "foo"))
"An alist of supported languages, with CARs being language names and
CDRs naming files expected to be found in
`cl-typesetting-hyphen::*hyphen-patterns-directory*'.")
(defvar *language-trie-alist*
'()
"An alist of (LANGUAGE . HYPHEN-TRIE) pairs, used to store
hyphenation data for respective languages.")
;; An hyphenation object is able to return the list
;; of hyphenation points for any word according to
;; the language it has been built for
(defclass hyphen-trie ()
((language :initform :american :accessor language :initarg :language)
#+nil
(fn-find-hyphen-points :initform () :accessor fn-find-hyphen-points :initarg :fn-find-hyphen-points)
(pattern-trie :initform () :accessor pattern-trie :initarg :pattern-trie)
(exception-trie :initform () :accessor exception-trie :initarg :exception-trie)
)
)
(defmethod (setf pattern-trie) (value hyphen-trie)
(declare (ignore hyphen-trie))
value)
(defmethod (setf exception-trie) (value hyphen-trie)
(declare (ignore hyphen-trie))
value)
(defvar *left-hyphen-minimum* 2
"Minimum number of characters that must precede a hyphen.")
(defvar *right-hyphen-minimum* 3
"Minimum number of characters that must follow a hyphen.")
;; This will hold the lambda function which will execute the trie
(defmethod hyphen-find-hyphen-points (hyphen-trie word)
#+nil
(funcall (symbol-function (fn-find-hyphen-points hyphen-trie)) word)
(let* ((word-seq (coerce word 'list))
(word-length (length word))
(result (hyphen-trie-find-exception word-seq (exception-trie hyphen-trie))))
(unless result
(setq result (hyphen-trie-find `(#\. ,@word-seq #\.) (pattern-trie hyphen-trie))))
(mapcar #'first (remove-if
#'(lambda (x)
(let ((idx (first x)))
(or (eq idx :end)
(< idx *left-hyphen-minimum*)
(< (- word-length idx) *right-hyphen-minimum*))))
result))))
;; Format of the language file:
;; the first line has 'pattern'
;; one pattern per line
;; TeX notation ^^xy allowed
;; exceptions introduced by a line with 'exception'
;; Parse a pattern from the language file
(defun hyphen-parse-pattern-line (line)
(let ((char-list (coerce line 'list))
key value (position 0))
(loop for c = (pop char-list)
while c
do
(cond ((and (eq c #\^) (eq (pop char-list) #\^)) ;; FIXME against syntax errors
(let* ((c1 (digit-char-p (pop char-list) 16))
(c2 (digit-char-p (pop char-list) 16)))
(push (code-char (+ (* c1 16) c2))
key))
(incf position))
((digit-char-p c) (push (cons position (digit-char-p c)) value))
(t (push c key) (incf position))))
(append (reverse key) value)))
;; Parse an exception from the language file
(defun hyphen-parse-exception-line (line)
(let ((char-list (coerce line 'list))
key value (position 0))
(loop for c = (pop char-list)
while c
do
(cond ((eq c #\-) (push (cons position 1) value))
(t (push c key) (incf position))))
(append (reverse key) (or value (list (cons :end 1))))))
;; Build a trie out of a sorted list
;; of pairs (word, hyph-points)
;;
(defun hyphen-make-trie (list-of-list depth)
(when (first list-of-list)
(let (result
subresult
(prev_c (caar list-of-list)))
(loop for l = (pop list-of-list)
while l
do
(cond
((eq (first l) prev_c) (push (rest l) subresult))
(t
(cond ((characterp prev_c)
(push `(,prev_c ,@(hyphen-make-trie subresult (+ depth 1))) result))
(t
(push `(:pattern ,prev_c ,@(first subresult)) result)))
(setq prev_c (first l)
subresult (list (rest l)))))
finally
(progn
(if (characterp prev_c)
(push `(,prev_c ,@(hyphen-make-trie subresult (+ depth 1))) result)
(push `(:pattern ,prev_c ,@(first subresult)) result)))
)
result)
))
;; Find a word in an hyphenation trie
;;
;;
(defun hyphen-trie-find-aux (word-seq trie)
(when trie
(append
(rest (assoc :pattern trie))
(when word-seq
(hyphen-trie-find-aux (rest word-seq) (cdr (assoc (first word-seq) trie)))))))
(defun hyphen-trie-find (word-seq trie)
(let* ((pos -2))
(remove-if #'evenp
(remove-duplicates
(sort
(mapcon #'(lambda (x)
(incf pos)
(mapcar #'(lambda (y)
(cons (+ (first y) pos) (rest y)))
(hyphen-trie-find-aux x trie))) word-seq)
#'(lambda (x y) (or (< (first x) (first y))
(and (= (first x) (first y)) (< (rest x) (rest y)))))
)
:key #'first)
:key #'rest)))
;; Exceptions are a bit different
;; Either a word is an exception or not, but no patter to test
;;
(defun hyphen-trie-find-exception-aux (word-seq trie)
(when trie
(if word-seq
(hyphen-trie-find-exception-aux (rest word-seq) (cdr (assoc (first word-seq) trie)))
(rest (assoc :pattern trie)))))
(defun hyphen-trie-find-exception (word-seq trie)
(remove-duplicates
(sort
(hyphen-trie-find-exception-aux word-seq trie)
#'(lambda (x y) (or (< (first x) (first y))
(and (= (first x) (first y))
(< (rest x) (rest y)))))
)
:key #'first))
;;; Annoying, but Lispworks is not able to compile the resulting lambda
;;; under x86 architectures. Moreover, it is not clear if it is the most
;;; efficient approach.
#+nil
(defun hyphen-compile-patterns (patterns level pos)
(declare (optimize speed))
(declare (fixnum level pos))
(if (= level 0)
`(lambda (word)
(let (result
(word-seq (coerce word 'list))
)
(declare (fixnum pos))
(loop for pos from 0 to (- (length word-seq) 1) by 1
do
,(compile-hyphen-patterns patterns (+ 1 level) pos)
(pop word-seq)
)
result))
(let* ((pattern-contrib (remove-if #'(lambda (x) (characterp (first x))) patterns))
(trie-contrib (remove-if #'(lambda (x) (eq (first x) :pattern)) patterns))
(pattern-inst
(mapcar
#'(lambda (x)
`(push
(cons (+ ,(caadr x) pos) ,(cdadr x))
result))
pattern-contrib))
(trie-inst
(cond ((null trie-contrib) nil)
((null (rest trie-contrib))
`(when (char=
(nth ,(- level 1) word-seq)
,(first (first trie-contrib)))
,(hyphen-compile-patterns (rest (first trie-contrib)) (+ level 1) pos)))
(t
`(case (nth ,(- level 1) word-seq)
,@(mapcar #'(lambda (l)
(list (first l)
(hyphen-compile-patterns (rest l) (+ level 1) pos))) trie-contrib)))))
(inst (if (null pattern-inst)
(if (null (rest trie-inst))
(first trie-inst)
trie-inst)
(if (null trie-inst)
(if (null (rest pattern-inst))
(first pattern-inst)
pattern-inst)
`(,@pattern-inst ,trie-inst)))))
(if (and (atom (first inst)) (fboundp (first inst)))
inst
`(progn ,@inst)))))
(defun hyphen-cmp-char-lists (l1 l2)
(let (result done)
(loop for c1 = (pop l1)
for c2 = (pop l2)
while (and (characterp c1) (characterp c2) (not done))
do
(if (char< c1 c2)
(setq result t done t)
(if (char> c1 c2)
(setq done t)
))
finally (if done result nil))))
(defmethod read-hyphen-file (hyphen-trie)
(let ((filename (make-pathname :name (cdr (assoc (language hyphen-trie)
*language-hyphen-file-list*))
:type "txt" :version nil
:defaults *hyphen-patterns-directory*))
patterns exceptions count)
(with-open-file (input filename :external-format pdf::+external-format+)
(do ((line (read-line input nil nil))
mode)
((null line))
(cond ((search "pattern" line)
(setq mode :patterns count 0))
((search "exception" line)
(setq mode :exception count 0))
((eq mode :patterns)
(push (hyphen-parse-pattern-line line) patterns)
(incf count))
((eq mode :exception)
(push (hyphen-parse-exception-line line) exceptions)
(incf count))
(t ))
(setf line (read-line input nil nil))
)
)
(setq patterns (sort patterns #'hyphen-cmp-char-lists)
exceptions (sort exceptions #'hyphen-cmp-char-lists))
#+nil
(setq patterns (compile-hyphen-patterns (hyphen-make-trie patterns 0) 0 0))
;; Lispworks x86 is not able to compile a lambda of arbitrary size
#+nil
(progn
(harlequin-common-lisp::toggle-source-debugging nil)
(setf (symbol-function (fn-find-hyphen-points hyphen-trie))
(compile nil patterns)))
(setf (pattern-trie hyphen-trie) (hyphen-make-trie patterns 0))
(setf (exception-trie hyphen-trie) (hyphen-make-trie exceptions 0))))
(defun language-defined-p (lang-sym)
(assoc lang-sym *language-hyphen-file-list*))
(defun define-language (lang-sym hyphen-file)
(unless (language-defined-p lang-sym)
(push (cons lang-sym hyphen-file) *language-hyphen-file-list*)))
(defun language-loaded-p (lang-sym)
(assoc lang-sym *language-trie-alist*))
(defun load-language (lang-sym &optional force)
(cond ((not (language-defined-p lang-sym))
(error "Unsupported language ~S" lang-sym))
((or (not (language-loaded-p lang-sym))
force)
(let ((trie (make-instance 'hyphen-trie :language lang-sym)))
(push (cons lang-sym trie) *language-trie-alist*)
(read-hyphen-file trie)
trie))))
(defun language-hyphenation (lang-sym)
(let ((lang-defined (language-loaded-p lang-sym)))
(if (not lang-defined)
(error "Language not loaded, please use (LOAD-LANGUAGE ~S)" lang-sym)
(cdr lang-defined))))
;; XXX These are left for software that could refer directly to these
;; variables but no more languages should be added this way. Instead,
;; please use DEFINE-LANGUAGE and LOAD-LANGUAGE:
;;
;; (define-language :polish "plhyph")
;; (load-language :polish)
;;
;; Initialization performed in ZZINIT.LISP.
(defparameter *american-hyphen-trie* nil)
(defparameter *french-hyphen-trie* nil)