/// <summary>
/// In-memory representation of the suffix (<c>SFX</c>) rule sets found in a Hunspell affix file.
/// </summary>
public class Affix {

    /// <summary>
    /// Gets the parsed suffix rule sets, keyed by the rule set name (second field of the <c>SFX</c> header line).
    /// </summary>
    public Dictionary<string, SuffixRuleSet> SuffixRuleSets { get; }

    /// <summary>
    /// Parses the raw contents of an affix file.
    /// </summary>
    /// <param name="raw">The full text of the affix file.</param>
    /// <exception cref="InvalidDataException">An <c>SFX</c> header line has fewer than four fields or a negative rule count.</exception>
    private Affix(string raw) {

        // Split on '\n' and drop a trailing '\r' so files with Windows line endings do not leak a
        // carriage return into the last field of every header and rule line
        string[] lines = raw.Split('\n').Select(x => x.TrimEnd('\r')).ToArray();

        SuffixRuleSets = new Dictionary<string, SuffixRuleSet>();

        for (int i = 0; i < lines.Length; i++) {

            string line = lines[i];

            // Skip blank lines and comments
            if (line.Length == 0 || line[0] == '#') continue;

            // Only suffix headers are of interest
            if (line.StartsWith("SFX ", System.StringComparison.Ordinal) == false) continue;

            // Tolerate repeated spaces/tabs between the header fields
            string[] pieces = line.Split(new[] { ' ', '\t' }, System.StringSplitOptions.RemoveEmptyEntries);
            if (pieces.Length < 4) {
                throw new InvalidDataException("Malformed SFX header at line " + (i + 1) + ": " + line);
            }

            string name = pieces[1];

            // A comment line directly above the header is used as the description of the rule set
            string previous = i == 0 ? null : lines[i - 1];
            string comment = previous != null && previous.StartsWith("#", System.StringComparison.Ordinal)
                ? previous.Substring(1).Trim()
                : null;

            // The fourth field holds the number of rule lines that follow the header
            int count = int.Parse(pieces[3], System.Globalization.CultureInfo.InvariantCulture);
            if (count < 0) {
                // A negative count would move the loop index backwards and never terminate
                throw new InvalidDataException("Negative SFX rule count at line " + (i + 1) + ": " + line);
            }

            // Copy the rule lines (fewer than "count" if the file ends early)
            int available = System.Math.Min(count, lines.Length - i - 1);
            string[] rules = new string[available];
            System.Array.Copy(lines, i + 1, rules, 0, available);

            SuffixRuleSet set = new SuffixRuleSet(name, comment, rules);
            SuffixRuleSets.Add(set.Name, set);

            // Jump past the rule lines that were just consumed
            i += count;

        }

    }

    /// <summary>
    /// Loads and parses the affix file at the specified <paramref name="path"/>.
    /// </summary>
    /// <param name="path">The path to the affix file.</param>
    /// <returns>An instance of <see cref="Affix"/>.</returns>
    public static Affix Load(string path) {
        return new Affix(File.ReadAllText(path));
    }

}
/// <summary>
/// Expands a free-text phrase into a nested AND/OR query. The text is split on spaces and each piece
/// becomes one OR group in the resulting AND list. Depending on whether Hunspell accepts the piece, the
/// group is filled with stems, dictionary variants, spelling suggestions and - when a piece and the piece
/// following it form a correctly spelled word when concatenated - the compound word as an alternative.
/// </summary>
/// <param name="text">The text to extend.</param>
/// <returns>An instance of <see cref="TextExtendResult"/> holding the words found for each processed piece and the query.</returns>
/// <exception cref="ArgumentNullException"><paramref name="text"/> is <c>null</c>.</exception>
public TextExtendResult Extend(string text) {

    if (text == null) throw new ArgumentNullException(nameof(text));

    // Split the text query into multiple pieces so we can analyze each word separately
    string[] pieces = text.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

    AndList query = new AndList();

    List<List<Word>> words = new List<List<Word>>();

    for (int i = 0; i < pieces.Length; i++) {

        string piece = pieces[i];

        List<Word> current = new List<Word> { new Word(WordType.Input, piece) };
        words.Add(current);

        OrList or = new OrList { Name = "O0" };
        query.Query.Add(or);

        // Pieces containing anything but letters are passed through untouched
        if (IsAlpha(piece) == false) {
            or.Append(piece);
            continue;
        }

        bool hasNext = i < pieces.Length - 1;
        string next = hasNext ? pieces[i + 1] : null;

        if (Hunspell.Spell(piece)) {

            if (hasNext && IsCompoundStopWord(piece) == false && IsCompoundStopWord(next) == false && Hunspell.Spell(piece + next)) {
                ExtendSpelledCompound(or, current, piece, next);
                i++; // the next piece has been consumed as part of the compound
                continue;
            }

            ExtendSpelledWord(or, current, piece);

        } else {

            current[0].IsMisspelled = true;

            if (hasNext && Hunspell.Spell(piece + next)) {
                ExtendMisspelledCompound(or, current, piece, next);
                i++; // the next piece has been consumed as part of the compound
                continue;
            }

            ExtendMisspelledWord(or, current, piece);

        }

    }

    return new TextExtendResult(words.Select(x => x.ToArray()).ToArray(), query);

}

/// <summary>
/// Returns whether <paramref name="word"/> is one of the words ("med", "mens") that must never be joined with a neighbour into a compound.
/// </summary>
private static bool IsCompoundStopWord(string word) {
    return word == "med" || word == "mens";
}

/// <summary>
/// Appends all variants of the dictionary items registered for <paramref name="stem"/> to <paramref name="target"/>.
/// </summary>
/// <returns><c>true</c> if the stem was found in the custom dictionary; otherwise <c>false</c>.</returns>
private bool AppendVariants(ListBase target, string stem) {

    if (Dictionary.TryGet(stem, out List<MyDictionaryItem> items) == false) return false;

    foreach (MyDictionaryItem item in items) {
        foreach (Variant variant in item.Variants) {
            target.Append(variant.Value);
        }
    }

    return true;

}

/// <summary>
/// Adds a clause for <paramref name="word"/> to <paramref name="target"/>: an OR list of variants (or bare stems) if the word is spelled correctly, otherwise the word itself.
/// </summary>
private void AppendWordClause(AndList target, string word, string name) {

    if (Hunspell.Spell(word) == false) {
        target.Append(word);
        return;
    }

    OrList alternatives = new OrList { Name = name };

    foreach (string stem in Hunspell.Stem(word)) {
        // Fall back to the stem itself when the custom dictionary doesn't know it
        if (AppendVariants(alternatives, stem) == false) alternatives.Append(stem);
    }

    target.Query.Add(alternatives);

}

/// <summary>
/// Handles a correctly spelled piece that also forms a correctly spelled word together with the piece following it: both "x AND y" and the variants of the compound are added as alternatives.
/// </summary>
private void ExtendSpelledCompound(OrList or, List<Word> current, string x, string y) {

    string z = x + y;

    AndList parts = new AndList { Name = "A2" };
    AppendWordClause(parts, x, "O1");
    AppendWordClause(parts, y, "O2");
    or.Query.Add(parts);

    foreach (string stem in Hunspell.Stem(z)) {
        current.Add(Word.Suggestion(stem, z));
        OrList variants = new OrList { Name = "O3" };
        AppendVariants(variants, stem);
        if (variants.Count > 0) or.Query.Add(variants);
    }

}

/// <summary>
/// Handles a correctly spelled piece on its own: its stems and their dictionary variants are added as alternatives.
/// </summary>
private void ExtendSpelledWord(OrList or, List<Word> current, string piece) {

    OrList stems = new OrList { Name = "O4" };

    foreach (string stem in Hunspell.Stem(piece)) {

        // Report the stem if it isn't equal to the input
        if (piece != stem) current.Add(Word.Stem(stem, piece));

        // Variants from the custom dictionary (if any), followed by the stem itself
        AppendVariants(stems, stem);
        stems.Append(stem);

    }

    if (stems.Count > 0) {
        stems.Query = stems.Query.Distinct().ToList();
        or.Query.Add(stems);
    } else {
        // No stems were returned: keep the word itself so the group doesn't end up empty
        or.Append(piece);
    }

}

/// <summary>
/// Handles a misspelled piece that forms a correctly spelled word together with the piece following it.
/// </summary>
private void ExtendMisspelledCompound(OrList or, List<Word> current, string x, string y) {

    string z = x + y;

    AndList parts = new AndList();
    or.Query.Add(parts);
    parts.Append(x);
    AppendWordClause(parts, y, null);

    OrList variants = new OrList();

    foreach (string stem in Hunspell.Stem(z)) {
        if (z != stem) current.Add(Word.Stem(stem, z));
        AppendVariants(variants, stem);
    }

    if (variants.Query.Any()) {
        variants.Query = variants.Query.Distinct().ToList();
        or.Query.Add(variants);
    }

}

/// <summary>
/// Handles a misspelled piece on its own: the piece, the suggestions from Hunspell and the variants (or stems) of each suggestion are added as alternatives.
/// </summary>
private void ExtendMisspelledWord(OrList or, List<Word> current, string piece) {

    or.Append(piece);

    foreach (string suggestion in Hunspell.Suggest(piece)) {

        current.Add(new Word(WordType.Suggestion, suggestion));
        or.Append(suggestion);

        foreach (string stem in Hunspell.Stem(suggestion)) {
            if (AppendVariants(or, stem) == false) or.Append(stem);
        }

    }

}
/// <summary>
/// Computes the Levenshtein distance between two strings: the minimum number of single-character
/// insertions, deletions and substitutions needed to turn <paramref name="s"/> into <paramref name="t"/>.
/// </summary>
/// <param name="s">The first string.</param>
/// <param name="t">The second string.</param>
/// <returns>The edit distance between the two strings.</returns>
/// <exception cref="ArgumentNullException"><paramref name="s"/> or <paramref name="t"/> is <c>null</c>.</exception>
public int Levenshtein(string s, string t) {

    if (s == null) throw new ArgumentNullException(nameof(s));
    if (t == null) throw new ArgumentNullException(nameof(t));

    int n = s.Length;
    int m = t.Length;

    // If either string is empty, the distance is the length of the other one
    if (n == 0) return m;
    if (m == 0) return n;

    // Only the previous and the current row of the distance matrix are needed at any time
    int[] previous = new int[m + 1];
    int[] current = new int[m + 1];

    // Row 0: turning the empty prefix of "s" into the first j characters of "t" takes j insertions
    for (int j = 0; j <= m; j++) {
        previous[j] = j;
    }

    for (int i = 1; i <= n; i++) {

        // Column 0: turning the first i characters of "s" into the empty string takes i deletions
        current[0] = i;

        for (int j = 1; j <= m; j++) {

            int cost = s[i - 1] == t[j - 1] ? 0 : 1;

            current[j] = Math.Min(
                Math.Min(previous[j] + 1, current[j - 1] + 1), // deletion, insertion
                previous[j - 1] + cost);                       // substitution (or match)

        }

        // The current row becomes the previous row of the next iteration
        int[] swap = previous;
        previous = current;
        current = swap;

    }

    return previous[m];

}
/// <summary>
/// Loads the dictionary file at <paramref name="path"/>. Each line after the first is expected to be a
/// stem, optionally followed by <c>/</c> and a comma separated list of numeric flags.
/// </summary>
/// <param name="path">The path to the dictionary file.</param>
/// <param name="affix">The affix rules handed to each created <see cref="MyDictionaryItem"/>.</param>
/// <returns>An instance of <see cref="MyDictionary"/>.</returns>
/// <exception cref="InvalidDataException">A line contains a flag that isn't a valid integer.</exception>
public static MyDictionary Load(string path, Affix affix) {

    Dictionary<string, List<MyDictionaryItem>> entries = new Dictionary<string, List<MyDictionaryItem>>();

    // The first line is skipped (presumably the entry count header of a Hunspell .dic file - confirm).
    // File.ReadLines streams the file instead of loading every line up front.
    foreach (string line in File.ReadLines(path).Skip(1)) {

        // Blank lines would otherwise be registered as an item with an empty stem
        if (string.IsNullOrWhiteSpace(line)) continue;

        // Known line that doesn't follow the "stem/flags" format
        if (line == "lægge, lægger, lægges, lagde, lagdes, læggende, læggendes, lagt, lagts, lagte, lagtes, læg") continue;

        string[] fields = line.Split('/', ',');

        int[] flags;

        try {
            // Flags are machine readable, so they are parsed independently of the current culture
            flags = fields
                .Skip(1)
                .Select(x => int.Parse(x, System.Globalization.CultureInfo.InvariantCulture))
                .ToArray();
        } catch (Exception ex) when (ex is FormatException || ex is OverflowException) {
            throw new InvalidDataException("Unable to parse line: " + line, ex);
        }

        // Entries whose first flag is 941 are skipped. Original note (Danish), translated: "compound, linking element".
        if (flags.Length > 0 && flags[0] == 941) continue;

        MyDictionaryItem item = new MyDictionaryItem(fields[0], flags, affix);

        // Items sharing the same stem are grouped in the same list
        if (entries.TryGetValue(item.Stem, out List<MyDictionaryItem> list) == false) {
            list = new List<MyDictionaryItem>();
            entries.Add(item.Stem, list);
        }

        list.Add(item);

    }

    return new MyDictionary(entries);

}
/// <summary>
/// Builds the list of variants and determines the type of the item. Called by the <c>Type</c> and
/// <c>Variants</c> getters the first time either of them is read.
/// </summary>
private void Init() {

    // The stem itself is always the first variant
    _variants = new List<Variant> { new Variant(Stem, string.Empty) };

    // Flags mapped to a type. When flags of several groups are present, Substantiv wins over Verbum,
    // which in turn wins over Proprium.
    int[] substantivFlags = { 46, 73, 193, 194, 252, 254, 736, 737, 815 };
    int[] verbumFlags = { 6, 140, 143, 148 };

    if (Flags.Any(x => substantivFlags.Contains(x))) {
        _type = MyDictionaryItemType.Substantiv;
    } else if (Flags.Any(x => verbumFlags.Contains(x))) {
        _type = MyDictionaryItemType.Verbum;
    } else if (Flags.Contains(55)) {
        _type = MyDictionaryItemType.Proprium;
    } else {
        _type = MyDictionaryItemType.Ukendt;
    }

    // Let each suffix rule set matching one of the flags add its variants (through AddVariant)
    foreach (int flag in Flags) {
        SuffixRuleSet ruleSet;
        if (Affix.SuffixRuleSets.TryGetValue(flag.ToString(), out ruleSet) == false) continue;
        ruleSet.Process(this);
    }

}
/// <summary>
/// Holds a stem together with the part of the original word (if any) that precedes the stem.
/// </summary>
public class StemResult {

    /// <summary>
    /// Gets the prefix. Never <c>null</c>; an empty string means there is no prefix.
    /// </summary>
    public string Prefix { get; }

    /// <summary>
    /// Gets the stem.
    /// </summary>
    public string Stem { get; }

    /// <summary>
    /// Initializes a new instance from the specified <paramref name="stem"/> and <paramref name="prefix"/>.
    /// </summary>
    /// <param name="stem">The stem.</param>
    /// <param name="prefix">The prefix, or <c>null</c> if there is none.</param>
    public StemResult(string stem, string prefix) {

        Stem = stem;

        if (prefix == null) {
            Prefix = string.Empty;
        } else {
            Prefix = prefix;
        }

    }

}
// The supported condition forms, compiled once instead of on every call
private static readonly Regex LiteralCondition = new Regex("^([0-9a-zA-Z]+)$", RegexOptions.Compiled);
private static readonly Regex CharSetCondition = new Regex("^\\[([0-9a-zA-Z]+)\\]$", RegexOptions.Compiled);
private static readonly Regex NegatedCharSetCondition = new Regex("^\\[\\^([0-9a-zA-Z]+)\\]$", RegexOptions.Compiled);
private static readonly Regex NegatedCharSetThenLiteralCondition = new Regex("^\\[\\^([0-9a-zA-Z]+)\\]([a-z]+)$", RegexOptions.Compiled);

/// <summary>
/// Applies this rule to <paramref name="item"/>: if the condition (<c>Value3</c>) matches the end of the
/// stem, the characters given by <c>Value1</c> are stripped from the end of the stem ("0" meaning none),
/// the suffix given by <c>Value2</c> is appended, and the result is added to the item as a new variant.
/// </summary>
/// <param name="item">The dictionary item to process.</param>
/// <remarks>
/// Supported conditions are <c>.</c>, a literal ending (<c>abc</c>), a character set (<c>[abc]</c>), a
/// negated character set (<c>[^abc]</c>) and a negated character set followed by a literal
/// (<c>[^abc]de</c>). Rules with any other condition are ignored.
/// </remarks>
public void Process(MyDictionaryItem item) {

    // Anything after a slash in the suffix field is not part of the suffix itself
    string suffix = Value2.Split('/')[0];

    // Rules with "0" as suffix are skipped.
    // NOTE(review): in the Hunspell affix format "0" denotes an empty suffix - confirm that skipping is intended.
    if (suffix == "0") return;

    string stem = item.Stem;

    // None of the conditions can be evaluated against an empty stem
    if (string.IsNullOrEmpty(stem)) return;

    // "0" means that nothing should be removed from the end of the stem
    string strip = Value1 == "0" ? string.Empty : Value1;

    // The rule can't apply if the stem doesn't end with the characters that should be stripped
    if (stem.EndsWith(strip, System.StringComparison.Ordinal) == false) return;

    // The raw rule (plus a marker identifying the matched condition form) is passed on as the comment.
    // NOTE(review): MyDictionaryItem.AddVariant inspects the text after the last '+' of the comment, and
    // the marker becomes part of that text - confirm this is intended.
    string comment = Raw;

    bool applies;
    Match match;

    if (Value3 == ".") {

        // Matches any stem
        applies = true;

    } else if ((match = LiteralCondition.Match(Value3)).Success) {

        comment += " M1";
        applies = stem.EndsWith(match.Groups[1].Value, System.StringComparison.Ordinal);

    } else if ((match = CharSetCondition.Match(Value3)).Success) {

        // The last character of the stem must be one of the listed characters
        comment += " M2";
        applies = match.Groups[1].Value.IndexOf(stem[stem.Length - 1]) >= 0;

    } else if ((match = NegatedCharSetCondition.Match(Value3)).Success) {

        // The last character of the stem must not be one of the listed characters
        comment += " M3";
        applies = match.Groups[1].Value.IndexOf(stem[stem.Length - 1]) < 0;

    } else if ((match = NegatedCharSetThenLiteralCondition.Match(Value3)).Success) {

        // The stem must end with the literal, and the character just before the literal must not be
        // one of the listed characters
        comment += " M5";

        string tail = match.Groups[2].Value;
        int pos = stem.Length - tail.Length - 1;

        applies = pos >= 0
            && stem.EndsWith(tail, System.StringComparison.Ordinal)
            && match.Groups[1].Value.IndexOf(stem[pos]) < 0;

    } else {

        // Unsupported condition
        return;

    }

    if (applies) {
        item.AddVariant(stem.Substring(0, stem.Length - strip.Length) + suffix, comment);
    }

}
/// <summary>
/// Describes a word found while analyzing a text: the word itself, how it was found, and an optional comment about its origin.
/// </summary>
public class Word {

    // Marker removed from values passed to the constructor
    private static readonly string[] Markers = { "(underbegreb)" };

    /// <summary>
    /// Gets or sets how the word was found.
    /// </summary>
    public WordType Type { get; set; }

    /// <summary>
    /// Gets or sets the value of the word.
    /// </summary>
    public string Value { get; set; }

    /// <summary>
    /// Gets or sets whether the word is misspelled.
    /// </summary>
    public bool IsMisspelled { get; set; }

    /// <summary>
    /// Gets or sets a comment about the word, e.g. which word it was derived from.
    /// </summary>
    public string Comment { get; set; }

    /// <summary>
    /// Initializes a new word. The value is cut at the <c>(underbegreb)</c> marker: only the first non-empty
    /// part is kept, or an empty string if nothing remains.
    /// </summary>
    /// <param name="type">The type of the word.</param>
    /// <param name="value">The value of the word.</param>
    /// <exception cref="ArgumentNullException"><paramref name="value"/> is <c>null</c>.</exception>
    public Word(WordType type, string value) {

        if (value == null) throw new ArgumentNullException(nameof(value));

        Type = type;

        // The split yields no parts at all for an empty value or a value consisting only of the marker
        string[] parts = value.Split(Markers, StringSplitOptions.RemoveEmptyEntries);
        Value = parts.Length > 0 ? parts[0] : string.Empty;

    }

    /// <summary>
    /// Initializes a new word with the specified <paramref name="comment"/>.
    /// </summary>
    /// <param name="type">The type of the word.</param>
    /// <param name="value">The value of the word.</param>
    /// <param name="comment">A comment about the word.</param>
    public Word(WordType type, string value, string comment) : this(type, value) {
        Comment = comment;
    }

    /// <summary>
    /// Creates a word of type <see cref="WordType.Suggestion"/> with the comment <c>via {from}</c>.
    /// </summary>
    /// <param name="value">The suggested word.</param>
    /// <param name="from">The word the suggestion was derived from.</param>
    public static Word Suggestion(string value, string from) {
        return new Word(WordType.Suggestion, value, "via " + from);
    }

    /// <summary>
    /// Creates a word of type <see cref="WordType.Stem"/> with the comment <c>via {from}</c>.
    /// </summary>
    /// <param name="value">The stem.</param>
    /// <param name="from">The word the stem was derived from.</param>
    public static Word Stem(string value, string from) {
        return new Word(WordType.Stem, value, "via " + from);
    }

}

/// <summary>
/// Indicates how a <see cref="Word"/> was found.
/// </summary>
public enum WordType {
    Input,
    Stem,
    Suggestion,
    Synonym
}
/// <summary>
/// Base class for a group of query parts - strings (terms) and nested lists - that is rendered as a raw
/// query in which the parts are joined by <see cref="Operator"/>.
/// </summary>
public abstract class ListBase {

    /// <summary>
    /// Gets the operator placed between the parts of the list when rendered.
    /// </summary>
    public abstract string Operator { get; }

    /// <summary>
    /// Gets or sets the parts of the list. Only strings and <see cref="ListBase"/> instances are rendered; other items are ignored.
    /// </summary>
    public List<object> Query { get; set; }

    /// <summary>
    /// Gets the number of parts in the list.
    /// </summary>
    public int Count => Query.Count;

    /// <summary>
    /// Gets or sets the name of the list. The name is not part of the rendered query.
    /// </summary>
    public string Name { get; set; }

    protected ListBase() {
        Query = new List<object>();
    }

    /// <summary>
    /// Appends a term to the list. Null and empty terms are ignored, as are two-character terms ending with an apostrophe.
    /// </summary>
    /// <param name="item">The term to append.</param>
    public void Append(string item) {
        // An empty term would be rendered as "field:" with nothing after the colon
        if (string.IsNullOrEmpty(item)) return;
        if (item.Length == 2 && item[1] == '\'') return;
        Query.Add(item);
    }

    /// <summary>
    /// Renders the list as a raw query in which each term is searched for in each of the specified <paramref name="fields"/>.
    /// </summary>
    /// <param name="fields">The names of the fields to search.</param>
    /// <returns>The raw query, wrapped in parentheses.</returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="fields"/> is <c>null</c>.</exception>
    public string ToRawQuery(string[] fields) {

        if (fields == null) throw new System.ArgumentNullException(nameof(fields));

        List<string> parts = new List<string>();

        foreach (object item in Query) {

            // NOTE(review): terms are inserted as-is; characters with a special meaning in the query
            // syntax are not escaped here
            if (item is string str) {
                List<string> clauses = new List<string>();
                foreach (string field in fields) {
                    clauses.Add(field + ":" + str);
                }
                parts.Add("(" + string.Join(" OR ", clauses) + ")");
            }

            if (item is ListBase list) {
                parts.Add(list.ToRawQuery(fields));
            }

        }

        return "(" + string.Join(" " + Operator + " ", parts) + ")";

    }

    /// <summary>
    /// Renders the list as a raw query in which each term is searched for in each of the specified
    /// <paramref name="fields"/>, with an extra boosted and/or fuzzy clause for fields that specify one.
    /// </summary>
    /// <param name="fields">The fields to search.</param>
    /// <returns>The raw query, wrapped in parentheses.</returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="fields"/> is <c>null</c>.</exception>
    public string ToRawQuery(FieldList fields) {

        if (fields == null) throw new System.ArgumentNullException(nameof(fields));

        List<string> parts = new List<string>();

        foreach (object item in Query) {

            if (item is string str) {

                List<string> clauses = new List<string>();

                foreach (Field field in fields) {

                    // Boost and fuzz values end up in a machine-readable query, so they are formatted
                    // with the invariant culture (matters if the values are fractional numbers - the
                    // types are declared outside this file)
                    if (field.Boost != null) {
                        clauses.Add(System.FormattableString.Invariant($"{field.FieldName}:({str} {str}*)^{field.Boost}"));
                    }

                    if (field.Fuzz != null) {
                        clauses.Add(System.FormattableString.Invariant($"{field.FieldName}:{str}~{field.Fuzz}"));
                    }

                    clauses.Add(field.FieldName + ":" + str);

                }

                parts.Add("(" + string.Join(" OR ", clauses) + ")");

            }

            if (item is ListBase list) {
                parts.Add(list.ToRawQuery(fields));
            }

        }

        return "(" + string.Join(" " + Operator + " ", parts) + ")";

    }

}
/// <summary>
/// A list whose parts are joined by the <c>OR</c> operator when rendered.
/// </summary>
public class OrList : ListBase {

    /// <summary>
    /// Gets the operator of the list, which is always <c>OR</c>.
    /// </summary>
    public override string Operator {
        get { return "OR"; }
    }

    /// <summary>
    /// Initializes a new, empty list.
    /// </summary>
    public OrList() { }

    /// <summary>
    /// Initializes a new list containing the specified <paramref name="items"/>.
    /// </summary>
    /// <param name="items">The initial parts of the list.</param>
    public OrList(params object[] items) : this() {
        Query.AddRange(items);
    }

}