-
Notifications
You must be signed in to change notification settings - Fork 13
Expand file tree
/
Copy pathtactful_tokenizer.rb
More file actions
211 lines (186 loc) · 7.06 KB
/
tactful_tokenizer.rb
File metadata and controls
211 lines (186 loc) · 7.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
# -*- encoding : utf-8 -*-
# TactfulTokenizer is a Ruby library for high quality sentence
# tokenization. It uses a Naive Bayesian statistical model, and
# is based on Splitta[http://code.google.com/p/splitta/]. But
# has support for '?' and '!' as well as primitive handling of
# XHTML markup. Better support for XHTML parsing is coming shortly.
#
# Example usage:
#
# require "tactful_tokenizer"
# m = TactfulTokenizer::Model.new
# m.tokenize_text("Here in the U.S. Senate we prefer to eat our friends. Is it easier that way, really? Yes.")
# #=> ["Here in the U.S. Senate we prefer to eat our friends.", "Is it easier that way, really?", "Yes."]
#
# The input text is expected to consist of paragraphs delimited
# by line breaks.
#
# Author:: Matthew Bunday (mailto:mkbunday@gmail.com)
# License:: GNU General Public License v3
# Performance TODOs:
# TODO: Use inline C where necessary?
# TODO: Use the RE2 regexp extension.
require "word_tokenizer.rb"
include WordTokenizer
module TactfulTokenizer
# Basic String extensions.
# Basic String extensions.
String.class_eval do
  # True when the string contains at least one letter or whitespace
  # character (this Splitta-derived heuristic intentionally counts
  # whitespace as "alphabetic").
  def is_alphabetic?
    !!(/[[:lower:][:upper:][:space:]]+/u =~ self)
  end

  # True when the string equals its own upcased form.
  # Surprisingly, this is faster than a regex in benchmarks.
  def is_upper_case?
    upcase == self
  end
end
# A model stores normalized probabilities of different features occuring.
# A model stores normalized probabilities of different features occurring.
class Model
  # feats = {feature => normalized probability of feature}
  # lower_words = {token => log count of occurrences in lower case}
  # non_abbrs = {token => log count of occurrences when not an abbreviation}
  attr_accessor :feats, :lower_words, :non_abbrs

  # Initialize the model. feats, lower_words, and non_abbrs
  # indicate the locations of the respective Marshal dumps.
  #
  # NOTE(review): Marshal.load is unsafe on untrusted data. These model
  # files ship with the library; never point this at external input.
  def initialize(feats="#{File.dirname(__FILE__)}/models/features.mar", lower_words="#{File.dirname(__FILE__)}/models/lower_words.mar", non_abbrs="#{File.dirname(__FILE__)}/models/non_abbrs.mar")
    @feats, @lower_words, @non_abbrs = [feats, lower_words, non_abbrs].map do |file|
      # binread: Marshal dumps are binary, and a text-mode read would
      # corrupt them on platforms that translate line endings.
      Marshal.load(File.binread(file))
    end
    # Prior probability of a sentence boundary, raised to the number of
    # base features multiplied in per fragment (4).
    @p0 = @feats["<prior>"] ** 4
  end

  # This function is the only one that'll end up being used.
  # m = TactfulTokenizer::Model.new
  # m.tokenize_text("Hey, are these two sentences? I bet they should be.")
  # => ["Hey, are these two sentences?", "I bet they should be."]
  def tokenize_text(text)
    data = Doc.new(text)
    featurize(data)
    classify(data)
    data.segment
  end

  # Assign a prediction (probability, to be precise) to each sentence fragment.
  # For each feature in each fragment we hunt up the normalized probability and
  # multiply. This is a fairly straightforward Bayesian probabilistic algorithm.
  def classify(doc)
    doc.frags.each do |frag|
      probs = @p0
      frag.features.each do |feat|
        probs *= @feats[feat]
      end
      # Normalize the odds into a probability.
      frag.pred = probs / (probs + 1)
    end
  end

  # Get the features of every fragment.
  def featurize(doc)
    doc.frags.each do |frag|
      get_features(frag, self)
    end
  end

  # Finds the features in a text fragment of the form:
  # ... w1. (sb?) w2 ...
  # Features listed in rough order of importance:
  # * w1: a word that includes a period.
  # * w2: the next word, if it exists.
  # * w1length: the number of alphabetic characters in w1.
  # * both: w1 and w2 taken together.
  # * w1abbr: logarithmic count of w1 occurring without a period.
  # * w2lower: logarithmic count of w2 occurring lowercased.
  def get_features(frag, model)
    w1 = frag.cleaned.last || ''
    w2 = frag.next || ''
    frag.features = ["w1_#{w1}", "w2_#{w2}", "both_#{w1}_#{w2}"]
    unless w2.empty?
      frag.push_w1_features(w1, model)
      frag.push_w2_features(w2, model)
    end
  end
end
# A document represents the input text. It holds a list of fragments generated
# from the text.
# A document represents the input text. It holds a list of fragments generated
# from the text.
class Doc
  # List of fragments.
  attr_accessor :frags

  # Receives a text, which is then broken into fragments.
  # A fragment ends with a period, question mark, or exclamation mark followed
  # possibly by right-handed punctuation like quotation marks or closing braces
  # and trailing whitespace. Failing that, it'll accept something like
  # "I hate cheese\n" — no terminal period, but that's the end of a paragraph.
  #
  # Input assumption: paragraphs delimited by line breaks.
  def initialize(text)
    @frags = []
    text.each_line do |line|
      next if line.strip.empty?
      line.split(/(.*?[.!?](?:[”"')\]}]|(?:<.*>))*[[:space:]])/u).each do |piece|
        next if piece.strip.empty?
        frag = Frag.new(piece)
        # Wire each fragment to the first word of its successor so that
        # the classifier can use the "next word" feature.
        @frags.last.next = frag.cleaned.first unless @frags.empty?
        @frags.push frag
      end
    end
  end

  # Segments the text. More precisely, it reassembles the fragments into sentences.
  # We call something a sentence whenever it is more likely to be a sentence than not.
  #
  # NOTE(review): a trailing run of fragments whose prediction never exceeds
  # the threshold is silently dropped — confirm this matches the intended
  # Splitta behavior before changing it.
  def segment
    sents = []
    sent = []
    thresh = 0.5
    @frags.each do |frag|
      sent.push(frag.orig)
      if frag.pred && frag.pred > thresh
        break if frag.orig.nil?
        sents.push(sent.join('').strip)
        sent = []
      end
    end
    sents
  end
end
# A fragment is a potential sentence, but is based only on the existence of a period.
# The text "Here in the U.S. Senate we prefer to devour our friends." will be split
# into "Here in the U.S." and "Senate we prefer to devour our friends."
# A fragment is a potential sentence, but is based only on the existence of a period.
# The text "Here in the U.S. Senate we prefer to devour our friends." will be split
# into "Here in the U.S." and "Senate we prefer to devour our friends."
class Frag
  # orig     = The original text of the fragment.
  # next     = The next word following the fragment.
  # cleaned  = Array of the fragment's words after cleaning.
  # pred     = Probability that the fragment is a sentence.
  # features = Array of the fragment's features.
  attr_accessor :orig, :next, :cleaned, :pred, :features

  # Build a fragment from its raw text and pre-compute the cleaned word list.
  def initialize(orig='')
    @orig = orig
    clean(orig)
    @next = nil
    @pred = nil
    @features = nil
  end

  # Normalizes numbers and discards ambiguous punctuation, then splits into an
  # array, because realistically only the last and first words are ever accessed.
  def clean(s)
    text = String.new(s)
    tokenize(text) # word_tokenizer's tokenize mutates its argument in place
    text = text.gsub(/[.,\d]*\d/, '<NUM>')
    text = text.gsub(/[^[[:upper:][:lower:]]\d[:space:],!?.;:<>\-'\/$% ]/u, '')
    @cleaned = text.gsub('--', ' ').split
  end

  # Features of the period-bearing word: capped length bucket and
  # its log count of appearing without a period (non-abbreviation evidence).
  def push_w1_features(w1, model)
    stem = w1.chop
    return unless stem.is_alphabetic?
    features.push("w1length_#{[10, w1.length].min}", "w1abbr_#{model.non_abbrs[stem]}")
  end

  # Features of the following word: capitalization of its first character
  # and its log count of appearing lowercased.
  def push_w2_features(w2, model)
    return unless w2.chop.is_alphabetic?
    features.push("w2cap_#{w2[0,1].is_upper_case?}", "w2lower_#{model.lower_words[w2.downcase]}")
  end
end
end