Skip to content

Commit f93e973

Browse files
authored
Created sample for text normalizing API. (dotnet#3133)
1 parent 21b5bb4 commit f93e973

File tree

2 files changed

+64
-0
lines changed

2 files changed

+64
-0
lines changed
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.Text;
4+
5+
namespace Microsoft.ML.Samples.Dynamic
6+
{
7+
/// <summary>
/// Sample demonstrating the 'NormalizeText' API: the text is lower-cased and its
/// diacritics, punctuation marks and numbers are removed.
/// </summary>
public static class NormalizeText
{
    /// <summary>
    /// Builds a text-normalizing pipeline, pushes a single sentence through it
    /// and writes the normalized result to the console.
    /// </summary>
    public static void Example()
    {
        // The ML context is the entry point for ML.NET operations. It can be used for
        // exception tracking and logging, and it is also the source of randomness.
        var context = new MLContext();

        // 'NormalizeText' produces a 'TextNormalizingEstimator', which is not a trainable
        // estimator, so no training rows are required. An empty list converted to an
        // IDataView is enough to hand the input schema to the pipeline.
        var schemaOnlyView = context.Data.LoadFromEnumerable(new List<TextData>());

        // Describe the normalization: lower-case everything and drop diacritics,
        // punctuation marks and numbers.
        var targetCase = Transforms.Text.TextNormalizingEstimator.CaseMode.Lower;
        var normalizer = context.Transforms.Text.NormalizeText("NormalizedText", "Text", targetCase,
            keepDiacritics: false,
            keepPunctuations: false,
            keepNumbers: false);

        // Fit the estimator against the schema-only view to obtain the transformer.
        var fittedNormalizer = normalizer.Fit(schemaOnlyView);

        // The prediction engine maps one input string to its normalized form.
        var engine = context.Model.CreatePredictionEngine<TextData, TransformedTextData>(fittedNormalizer);

        // Normalize one sample sentence.
        var input = new TextData
        {
            Text = "ML.NET's NormalizeText API changes the case of the TEXT and removes/keeps diâcrîtîcs, punctuations, and/or numbers (123)."
        };
        var output = engine.Predict(input);

        // Show the normalized text.
        Console.WriteLine($"Normalized Text: {output.NormalizedText}");

        // Expected output:
        // Normalized Text: mlnets normalizetext api changes the case of the text and removeskeeps diacritics punctuations andor numbers
    }

    /// <summary>
    /// Input row: carries the raw text to be normalized.
    /// </summary>
    public class TextData
    {
        public string Text { get; set; }
    }

    /// <summary>
    /// Output row: the original text plus the normalized text written by the pipeline.
    /// </summary>
    public class TransformedTextData : TextData
    {
        public string NormalizedText { get; set; }
    }
}
57+
}

src/Microsoft.ML.Transforms/Text/TextCatalog.cs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,13 @@ internal static TokenizingByCharactersEstimator TokenizeIntoCharactersAsKeys(thi
107107
/// <param name="keepDiacritics">Whether to keep diacritical marks or remove them.</param>
108108
/// <param name="keepPunctuations">Whether to keep punctuation marks or remove them.</param>
109109
/// <param name="keepNumbers">Whether to keep numbers or remove them.</param>
110+
/// <example>
111+
/// <format type="text/markdown">
112+
/// <![CDATA[
113+
/// [!code-csharp[NormalizeText](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/NormalizeText.cs)]
114+
/// ]]>
115+
/// </format>
116+
/// </example>
110117
public static TextNormalizingEstimator NormalizeText(this TransformsCatalog.TextTransforms catalog,
111118
string outputColumnName,
112119
string inputColumnName = null,

0 commit comments

Comments
 (0)