Skip to content

Commit

Permalink
Fixed numerous pip distribution errors
Browse files Browse the repository at this point in the history
  • Loading branch information
MSeal committed Jun 12, 2014
1 parent 2a61364 commit afac20e
Show file tree
Hide file tree
Showing 4 changed files with 155 additions and 128 deletions.
Empty file added .pydistutils.cfg
Empty file.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ All unit tests for the repo.
## TODO
* Add mthreads alongside pthreads for bulk operations
* Fix blocking issues for Windows usage (see above)
* Remove cacheman dependency

## Author
Author(s): Tim Rodriguez and Matthew Seal
Expand Down
249 changes: 133 additions & 116 deletions hunspell/hunspell.pyx
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import os
import runtimedir
from cacheman.cachewrap import NonPersistentCache
from cacheman.cacher import get_cache_manager
from cacheman.autosync import TimeCount, AutoSyncCache
Expand Down Expand Up @@ -126,20 +125,21 @@ cdef class HunspellWrap(object):

pyaffpath = os.path.join(self._hunspell_dir, '{}.aff'.format(lang))
pydpath = os.path.join(self._hunspell_dir, '{}.dic'.format(lang))
if (copy_to_c_string(pyaffpath, &self.affpath) <= 0 or
copy_to_c_string(pydpath, &self.dpath) <= 0):
raise MemoryError()
for fpath in (pyaffpath, pydpath):
if not os.path.isfile(fpath) or not os.access(fpath, os.R_OK):
raise IOError("File '{}' not found or accessible".format(fpath))

if (copy_to_c_string(pyaffpath, &self.affpath) <= 0 or
copy_to_c_string(pydpath, &self.dpath) <= 0):
raise MemoryError()
holder = new Hunspell(self.affpath, self.dpath)
if holder is NULL:
raise MemoryError()

return holder

# C-realm Constructor
def __init__(self, basestring lang, basestring cache_manager="hunspell", bint use_disk_cache=False,
def __init__(self, basestring lang, basestring cache_manager="hunspell", basestring disk_cache_dir=None,
basestring hunspell_data_dir=None):
# TODO - make these LRU caches so that you don't destroy your memory!
if hunspell_data_dir is None:
Expand All @@ -154,14 +154,16 @@ cdef class HunspellWrap(object):

self._cache_manager_name = cache_manager
manager = get_cache_manager(self._cache_manager_name)
if disk_cache_dir:
manager.cache_directory = disk_cache_dir
if not manager.cache_registered("hunspell_suggest"):
if use_disk_cache:
if disk_cache_dir:
custom_time_checks = [TimeCount(60, 1000000), TimeCount(300, 10000), TimeCount(900, 1)]
AutoSyncCache("hunspell_suggest", cache_manager=manager, time_checks=custom_time_checks)
else:
NonPersistentCache("hunspell_suggest", cache_manager=manager)
if not manager.cache_registered("hunspell_stem"):
if use_disk_cache:
if disk_cache_dir:
custom_time_checks = [TimeCount(60, 1000000), TimeCount(300, 10000), TimeCount(900, 1)]
AutoSyncCache("hunspell_stem", cache_manager=manager, time_checks=custom_time_checks)
else:
Expand All @@ -180,49 +182,61 @@ cdef class HunspellWrap(object):
cdef char *c_word = NULL
if copy_to_c_string(word, &c_word) <= 0:
raise MemoryError()
ret = self._cxx_hunspell.spell(c_word)
free(c_word)
return ret != 0

try:
return self._cxx_hunspell.spell(c_word) != 0
finally:
free(c_word)

# Python individual word suggestions
def suggest(self, basestring word):
if word in self._suggest_cache:
return self._suggest_cache[word]

cdef char **s_list = NULL
cdef char *c_word = NULL
if copy_to_c_string(word, &c_word) <= 0:
raise MemoryError()
cdef char **s_list
count = self._cxx_hunspell.suggest(&s_list, c_word)
suggestion_list = []

for i from 0 <= i < count:
suggestion_list.append(c_string_to_unicode_no_except(s_list[i]))

self._cxx_hunspell.free_list(&s_list, count)
free(c_word)
self._suggest_cache[word] = list(suggestion_list)
return suggestion_list
try:
count = self._cxx_hunspell.suggest(&s_list, c_word)
try:
suggestion_list = []
for i from 0 <= i < count:
suggestion_list.append(c_string_to_unicode_no_except(s_list[i]))

suggestion_list = tuple(suggestion_list)
self._suggest_cache[word] = suggestion_list
return suggestion_list
finally:
self._cxx_hunspell.free_list(&s_list, count)
finally:
free(c_word)

# Python individual word stemming
def stem(self, basestring word):
if word in self._stem_cache:
return self._stem_cache[word]

cdef char **s_list = NULL
cdef char *c_word = NULL
if copy_to_c_string(word, &c_word) <= 0:
raise MemoryError()
cdef char **s_list
count = self._cxx_hunspell.stem(&s_list, c_word)
stem_list = []

for i from 0 <= i < count:
stem_list.append(c_string_to_unicode_no_except(s_list[i]))

self._cxx_hunspell.free_list(&s_list, count)
free(c_word)
self._stem_cache[word] = list(stem_list)
return stem_list
try:
count = self._cxx_hunspell.stem(&s_list, c_word)
try:
stem_list = []
for i from 0 <= i < count:
stem_list.append(c_string_to_unicode_no_except(s_list[i]))

stem_list = tuple(stem_list)
self._stem_cache[word] = stem_list
return stem_list
finally:
self._cxx_hunspell.free_list(&s_list, count)
finally:
free(c_word)

def save_cache(self):
get_cache_manager(self._cache_manager_name).save_all_cache_contents()
Expand All @@ -241,70 +255,71 @@ cdef class HunspellWrap(object):

# Allocate all memory per thread
thread_args = <ThreadWorkerArgs *>calloc(self.n_cpus, sizeof(ThreadWorkerArgs))
if thread_args is NULL:
raise MemoryError()

threads = <pthread_t *>calloc(self.n_cpus, sizeof(pthread_t))
if threads is NULL:
if thread_args is NULL or threads is NULL:
raise MemoryError()

# Divide workload between threads
words_per_thread = n_words / self.n_cpus
words_distributed = 0
# If uneven, round down the number of words per thread (the last thread picks up the leftover words)
if n_words % self.n_cpus != 0:
words_per_thread = (n_words - (n_words % self.n_cpus)) / self.n_cpus
try:
# Divide workload between threads
words_per_thread = n_words / self.n_cpus
words_distributed = 0
# If uneven, round down the number of words per thread (the last thread picks up the leftover words)
if n_words % self.n_cpus != 0:
words_per_thread = (n_words - (n_words % self.n_cpus)) / self.n_cpus

for i from 0 <= i < self.n_cpus:
stride = i * words_per_thread
thread_args[i].tid = i
for i from 0 <= i < self.n_cpus:
stride = i * words_per_thread
thread_args[i].tid = i

# Allocate one Hunspell Dict per thread since a shared Hunspell instance isn't thread-safe.
thread_args[i].hspell = self._create_hspell_inst(self.lang)
# Allocate one Hunspell Dict per thread since a shared Hunspell instance isn't thread-safe.
thread_args[i].hspell = self._create_hspell_inst(self.lang)

# Account for leftovers
if i == self.n_cpus - 1:
thread_args[i].n_words = n_words - words_distributed
else:
thread_args[i].n_words = words_per_thread
words_distributed += words_per_thread

# Find the stride into each array
thread_args[i].word_list = &word_array[stride]
thread_args[i].output_array_ptr = &output_array[stride]
thread_args[i].output_counts = &output_counts[stride]

# Create thread
if action == "stem":
rc = pthread_create(&threads[i], NULL, hunspell_stem_worker, <void *> &thread_args[i])
else: # suggest
rc = pthread_create(&threads[i], NULL, hunspell_suggest_worker, <void *> &thread_args[i])
if rc:
raise OSError(rc, "Could not create pthread")

# wait for each thread to complete
for i from 0 <= i < self.n_cpus:
# block until thread i completes
rc = pthread_join(threads[i], NULL)
if rc:
raise OSError(rc, "Could not join pthread")

# Free Hunspell Dict
del thread_args[i].hspell

# Free top level stuff
free(thread_args)
free(threads)
return 1
# Account for leftovers
if i == self.n_cpus - 1:
thread_args[i].n_words = n_words - words_distributed
else:
thread_args[i].n_words = words_per_thread
words_distributed += words_per_thread

# Find the stride into each array
thread_args[i].word_list = &word_array[stride]
thread_args[i].output_array_ptr = &output_array[stride]
thread_args[i].output_counts = &output_counts[stride]

# Create thread
if action == "stem":
rc = pthread_create(&threads[i], NULL, hunspell_stem_worker, <void *> &thread_args[i])
else: # suggest
rc = pthread_create(&threads[i], NULL, hunspell_suggest_worker, <void *> &thread_args[i])
if rc:
raise OSError(rc, "Could not create pthread")

# wait for each thread to complete
for i from 0 <= i < self.n_cpus:
# block until thread i completes
rc = pthread_join(threads[i], NULL)
if rc:
raise OSError(rc, "Could not join pthread")

# Free Hunspell Dict
del thread_args[i].hspell
return 1
finally:
# Free top level stuff
free(thread_args)
free(threads)

# Parse the return of a bulk action
cdef void _parse_bulk_results(self, dict ret_dict, list unknown_words, int *output_counts, char ***output_array) except +:
cdef int i, j
for i from 0 <= i < len(unknown_words):
for j from 0 <= j < output_counts[i]:
ret_dict[unknown_words[i]].append(c_string_to_unicode_no_except(output_array[i][j]))
# Free each suggestion list
self._cxx_hunspell.free_list(output_array + i, output_counts[i])
try:
for i from 0 <= i < len(unknown_words):
for j from 0 <= j < output_counts[i]:
ret_dict[unknown_words[i]].append(c_string_to_unicode_no_except(output_array[i][j]))
finally:
for i from 0 <= i < len(unknown_words):
# Free each suggestion list
self._cxx_hunspell.free_list(output_array + i, output_counts[i])

#
# Python API - Accepts a list of words, returns a dict of words mapped to a list of their hunspell suggestions
Expand Down Expand Up @@ -337,45 +352,47 @@ cdef class HunspellWrap(object):

# Initialize C word list
# C version of: ["foo", "bar", "baz"]
cdef char ***output_array = NULL
cdef int *output_counts = NULL
cdef char **word_array = <char **>calloc(len(unknown_words), sizeof(char *))
if word_array is NULL:
raise MemoryError()
for i, unknown_word in enumerate(unknown_words):
if copy_to_c_string(unknown_word, &word_array[i]) <= 0:
raise MemoryError()

# Create output arrays
# Array of arrays of C strings (e.g. [["food", ...], ["bar"], ["bad", ...]])
# This array will be divided evenly amongst the threads for the return values
# of Hunspell.suggest(), each call returns an array of C strings
cdef char ***output_array = <char ***>calloc(len(unknown_words), sizeof(char **))
if output_array is NULL:
raise MemoryError()

# Array of integers, each the length of the corresponding C string array
# This array will be divided evenly amongst the threads for the length of the
# arrays returned by each call to Hunspell.suggest()
cdef int *output_counts = <int *>calloc(len(unknown_words), sizeof(int))
if output_counts is NULL:
raise MemoryError()

# Schedule bulk job
self._c_bulk_action(action, word_array, output_array, len(unknown_words), output_counts)

# Parse the return
self._parse_bulk_results(ret_dict, unknown_words, output_counts, output_array)
try:
# Create output arrays
# Array of arrays of C strings (e.g. [["food", ...], ["bar"], ["bad", ...]])
# This array will be divided evenly amongst the threads for the return values
# of Hunspell.suggest(), each call returns an array of C strings
output_array = <char ***>calloc(len(unknown_words), sizeof(char **))

# Array of integers, each the length of the corresponding C string array
# This array will be divided evenly amongst the threads for the length of the
# arrays returned by each call to Hunspell.suggest()
output_counts = <int *>calloc(len(unknown_words), sizeof(int))
if output_counts is NULL or output_array is NULL:
raise MemoryError()

# Add ret_dict words to cache
if action == "stem":
for i from 0 <= i < len(unknown_words):
self._stem_cache[unknown_words[i]] = ret_dict[unknown_words[i]]
else:
for i from 0 <= i < len(unknown_words):
self._suggest_cache[unknown_words[i]] = ret_dict[unknown_words[i]]
try:
# Schedule bulk job
self._c_bulk_action(action, word_array, output_array, len(unknown_words), output_counts)

# Free top level stuff
free(output_array)
free(output_counts)
self._cxx_hunspell.free_list(&word_array, len(unknown_words))
# Parse the return
self._parse_bulk_results(ret_dict, unknown_words, output_counts, output_array)

return ret_dict
# Add ret_dict words to cache
if action == "stem":
for i from 0 <= i < len(unknown_words):
self._stem_cache[unknown_words[i]] = ret_dict[unknown_words[i]]
else:
for i from 0 <= i < len(unknown_words):
self._suggest_cache[unknown_words[i]] = ret_dict[unknown_words[i]]
return ret_dict
finally:
# Free top level stuff
free(output_array)
free(output_counts)
finally:
self._cxx_hunspell.free_list(&word_array, len(unknown_words))
Loading

0 comments on commit afac20e

Please sign in to comment.