Skip to content

Commit b441a41

Browse files
committed
[master] - extended language checker
1 parent 82631dd commit b441a41

File tree

1 file changed

+29
-9
lines changed

1 file changed

+29
-9
lines changed

src/main/java/util/validator/LanguageChecker.java

Lines changed: 29 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -33,16 +33,23 @@ public static Optional<LdLocale> getRecognisedLanguage(String text) throws IOExc
3333
return languageDetector.detect(textObject);
3434
}
3535

36+
public static Optional<LdLocale> getRecognisedLanguage(WebDriver driver) throws IOException {
37+
List<LanguageProfile> languageProfiles = new LanguageProfileReader().readAllBuiltIn();
38+
39+
LanguageDetector languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard())
40+
.withProfiles(languageProfiles)
41+
.build();
42+
43+
TextObjectFactory textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText();
44+
45+
TextObject textObject = textObjectFactory.forText(getTextFromPage(driver));
46+
47+
return languageDetector.detect(textObject);
48+
}
49+
3650
public static boolean isCorrectLanguageOnThePage(WebDriver driver, String lang) throws IOException {
3751
boolean isCorrectLang = true;
38-
JavascriptExecutor jse = (JavascriptExecutor) driver;
39-
String bodyText = jse.executeScript("return document.body.innerHTML", "").toString();
40-
bodyText = bodyText.replaceAll("<script\\b[^<]*(?:(?!<\\/script>)<[^<]*)*<\\/script>", " ");
41-
bodyText = bodyText.replaceAll("<noscript\\b[^<]*(?:(?!<\\/noscript>)<[^<]*)*<\\/noscript>", " ");
42-
bodyText = bodyText.replaceAll("<style\\b[^<]*(?:(?!<\\/style>)<[^<]*)*<\\/style>", " ");
43-
bodyText = bodyText.replaceAll("<pre\\b[^<]*(?:(?!<\\/pre>)<[^<]*)*<\\/pre>", " ");
44-
bodyText = bodyText.replaceAll("<[^>]*>", " ");
45-
bodyText = bodyText.toLowerCase().replaceAll("[\\t|\\n|\\r|\\s]+", " ").replaceAll("[\\s]+", " ");
52+
String bodyText = getTextFromPage(driver);
4653

4754
int textBlockLength = 300;
4855
int bodyTextLength = bodyText.length();
@@ -53,7 +60,7 @@ public static boolean isCorrectLanguageOnThePage(WebDriver driver, String lang)
5360
} else {
5461
for (int i = 0; i < bodyTextLength; i += textBlockLength) {
5562
String tempString;
56-
if (bodyTextLength >= (i + textBlockLength) ) {
63+
if (bodyTextLength >= (i + textBlockLength)) {
5764
tempString = bodyText.substring(i, i + textBlockLength);
5865
try {
5966
String detectedLanguage = getRecognisedLanguage(tempString).get().getLanguage();
@@ -75,4 +82,17 @@ public static boolean isCorrectLanguageOnThePage(WebDriver driver, String lang)
7582
}
7683
return isCorrectLang;
7784
}
85+
86+
private static String getTextFromPage(WebDriver driver) {
87+
JavascriptExecutor jse = (JavascriptExecutor) driver;
88+
String bodyText = jse.executeScript("return document.body.innerHTML", "").toString();
89+
bodyText = bodyText.replaceAll("<script\\b[^<]*(?:(?!</script>)<[^<]*)*</script>", " ")
90+
.replaceAll("<noscript\\b[^<]*(?:(?!</noscript>)<[^<]*)*</noscript>", " ")
91+
.replaceAll("<style\\b[^<]*(?:(?!</style>)<[^<]*)*</style>", " ")
92+
.replaceAll("<pre\\b[^<]*(?:(?!</pre>)<[^<]*)*</pre>", " ")
93+
.replaceAll("<[^>]*>", " ").toLowerCase()
94+
.replaceAll("[\\t|\\n|\\r|\\s]+", " ").replaceAll("[\\s]+", " ");
95+
96+
return bodyText;
97+
}
7898
}

0 commit comments

Comments
 (0)