From 454726de8ce8007c1acde5f37070e133a9e2101a Mon Sep 17 00:00:00 2001
From: wuchunfu <319355703@qq.com>
Date: Sat, 15 Feb 2025 19:04:47 +0800
Subject: [PATCH] BIGTOP-4361: Add Chinese check

---
 .github/check_chinese_character.py            | 154 ++++++++++++++++
 .github/workflows/ci.yml                      |  16 +-
 bigtop-manager-bom/pom.xml                    |   6 -
 bigtop-manager-common/pom.xml                 |   5 -
 .../utils/ChineseCharacterCheckTest.java      | 165 ------------------
 5 files changed, 158 insertions(+), 188 deletions(-)
 create mode 100644 .github/check_chinese_character.py
 delete mode 100644 bigtop-manager-common/src/test/java/org/apache/bigtop/manager/common/utils/ChineseCharacterCheckTest.java

diff --git a/.github/check_chinese_character.py b/.github/check_chinese_character.py
new file mode 100644
index 000000000..51dc71af8
--- /dev/null
+++ b/.github/check_chinese_character.py
@@ -0,0 +1,154 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#    https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+"""Fail when project source files contain Chinese characters.
+
+The scan root is derived from this file's location (<root>/.github/), so the
+script behaves the same regardless of the current working directory.
+"""
+import os
+import re
+import sys
+from pathlib import Path
+from typing import List, Set
+
+
+class ChineseCharacterCheckTest:
+    """Scan project sources for Chinese characters in comments or string literals."""
+
+    CHINESE_CHAR_PATTERN = re.compile(r'[\u4e00-\u9fa5]')
+    # One alternation scanned left to right, so a "//" that sits inside a block
+    # comment is not reported a second time as a line comment.
+    COMMENT_PATTERN = re.compile(r'//[^\n]*|/\*.*?\*/|<!--.*?-->', re.DOTALL)
+    # Names of directories or files to skip. Entries are compared with whole path
+    # components (and with the file name minus its extension), never as substrings,
+    # so "dist" does not exclude e.g. a "distribution" directory.
+    EXCLUDED_DIRS_AND_FILES: Set[str] = {
+        "target",
+        "node_modules",
+        "dist",
+    }
+    # Supported file extensions
+    SUPPORTED_EXTENSIONS: Set[str] = {".java", ".kt", ".scala", ".js", ".ts", ".vue"}
+    # Repository root; this script lives in <root>/.github/.
+    PROJECT_ROOT = Path(__file__).resolve().parent.parent
+
+    def should_not_contain_chinese_in_comments(self):
+        """Raise AssertionError if any scanned comment contains Chinese characters."""
+        violations = self.scan_for_chinese_characters(ScanTarget.COMMENTS)
+        self.assert_no_chinese_characters(violations)
+
+    def scan_for_chinese_characters(self, target: 'ScanTarget') -> List[str]:
+        """Walk PROJECT_ROOT and return one message per offending comment or literal."""
+        violations: List[str] = []
+        for dirpath, dirnames, filenames in os.walk(self.PROJECT_ROOT):
+            # Prune in place so os.walk never descends into excluded directories.
+            dirnames[:] = sorted(d for d in dirnames if d not in self.EXCLUDED_DIRS_AND_FILES)
+            for filename in sorted(filenames):
+                path = Path(dirpath) / filename
+                if self.is_valid_file(path) and not self.is_excluded(path):
+                    self.process_file(path, target, violations)
+        return violations
+
+    def is_excluded(self, path: Path) -> bool:
+        """Return True if *path* has an excluded component below PROJECT_ROOT."""
+        try:
+            # Ignore directories above the root so the checkout location itself
+            # cannot exclude every file.
+            parts = path.resolve().relative_to(self.PROJECT_ROOT).parts
+        except ValueError:
+            parts = path.parts
+        return not self.EXCLUDED_DIRS_AND_FILES.isdisjoint(parts + (path.stem,))
+
+    def is_valid_file(self, path: Path) -> bool:
+        """Return True if *path* is a regular file with a supported extension."""
+        return path.suffix in self.SUPPORTED_EXTENSIONS and path.is_file()
+
+    def process_file(self, path: Path, target: 'ScanTarget', violations: List[str]):
+        """Read *path* as UTF-8 and append the violations selected by *target*."""
+        try:
+            content = path.read_text(encoding='utf-8')
+        except (OSError, UnicodeDecodeError) as e:
+            # Best effort: an unreadable file is reported but does not abort the scan.
+            print(f"Error processing file: {path}", file=sys.stderr)
+            print(e, file=sys.stderr)
+            return
+        if target.include_comments():
+            self.check_comments(content, path, violations)
+        if target.include_code():
+            self.check_code(content, path, violations)
+
+    def check_comments(self, content: str, path: Path, violations: List[str]):
+        """Append a violation for each //, /* */ or <!-- --> comment with Chinese text."""
+        for comment in self.COMMENT_PATTERN.findall(content):
+            if self.CHINESE_CHAR_PATTERN.search(comment):
+                violations.append(self.format_violation(path, "comment", comment.strip()))
+
+    def check_code(self, content: str, path: Path, violations: List[str]):
+        """Append a violation for each quoted string literal with Chinese text.
+
+        The match is purely lexical: escaped quotes are not understood.
+        """
+        string_patterns = (
+            r'"[^"]*"',  # Double quoted strings
+            r"'[^']*'",  # Single quoted strings
+        )
+        for pattern in string_patterns:
+            for string_literal in re.findall(pattern, content):
+                if self.CHINESE_CHAR_PATTERN.search(string_literal):
+                    violations.append(self.format_violation(path, "code", string_literal))
+
+    def format_violation(self, path: Path, location: str, content: str) -> str:
+        """Build the message reported for one offending comment or literal."""
+        return f"Chinese characters found in {location} at {path.absolute()}: {content}"
+
+    def assert_no_chinese_characters(self, violations: List[str]):
+        """Raise AssertionError listing *violations*; do nothing when it is empty."""
+        # An explicit raise, unlike an assert statement, still fires under "python -O".
+        if violations:
+            raise AssertionError(f"Found Chinese characters in files:\n{os.linesep.join(violations)}")
+
+
+class ScanTarget:
+    """Select what is checked: comments, code (string literals), or both."""
+
+    def __init__(self, check_comments: bool, check_code: bool):
+        self.check_comments = check_comments
+        self.check_code = check_code
+
+    def include_comments(self) -> bool:
+        """Return True if comments should be checked."""
+        return self.check_comments
+
+    def include_code(self) -> bool:
+        """Return True if string literals should be checked."""
+        return self.check_code
+
+
+# Predefined targets, used like enum members.
+ScanTarget.COMMENTS = ScanTarget(True, False)
+ScanTarget.CODE = ScanTarget(False, True)
+ScanTarget.ALL = ScanTarget(True, True)
+
+if __name__ == "__main__":
+    test = ChineseCharacterCheckTest()
+    try:
+        test.should_not_contain_chinese_in_comments()
+    except AssertionError as error:
+        # Print the report without a traceback and fail the CI step.
+        sys.exit(str(error))
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 5174d8080..bd91fad8f 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -43,19 +43,11 @@ jobs:
         with:
           submodules: true
       - name: Set JDK
-        uses: actions/setup-java@v4
+        uses: actions/setup-python@v4
         with:
-          distribution: 'temurin'
-          java-version: '17'
-          cache: 'maven'
-      - name: Cache local Maven repository
-        uses: actions/cache@v4
-        with:
-          path: ~/.m2/repository
-          key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}
-          restore-keys: |
-            ${{ runner.os }}-maven-
-      - run: ./mvnw clean test -Dskip.pnpm -Dskip.installnodepnpm -Dskip.pnpm.test -DfailIfNoTests=false -Dtest=ChineseCharacterCheckTest
+          python-version: '3.13'
+          # No pip cache: the check script only uses the standard library.
+      - run: python .github/check_chinese_character.py
 
   unit-tests-java:
     name: "Run unit test(Java)"
diff --git a/bigtop-manager-bom/pom.xml b/bigtop-manager-bom/pom.xml
index 85efae8a2..ce8f88b89 100644
--- a/bigtop-manager-bom/pom.xml
+++ b/bigtop-manager-bom/pom.xml
@@ -53,7 +53,6 @@
         <langchain4j.version>0.35.0</langchain4j.version>
         <mybatis-spring-boot-starter.version>3.0.3</mybatis-spring-boot-starter.version>
         <pagehelper-spring-boot-starter.version>2.1.0</pagehelper-spring-boot-starter.version>
-        <javaparser.version>3.26.3</javaparser.version>
     </properties>
 
     <dependencyManagement>
@@ -279,11 +278,6 @@
                 <artifactId>langchain4j-reactor</artifactId>
                 <version>${langchain4j.version}</version>
             </dependency>
-            <dependency>
-                <groupId>com.github.javaparser</groupId>
-                <artifactId>javaparser-core</artifactId>
-                <version>${javaparser.version}</version>
-            </dependency>
         </dependencies>
     </dependencyManagement>
 </project>
diff --git a/bigtop-manager-common/pom.xml b/bigtop-manager-common/pom.xml
index f93338cdf..ad71ab9d8 100644
--- a/bigtop-manager-common/pom.xml
+++ b/bigtop-manager-common/pom.xml
@@ -91,10 +91,5 @@
             <artifactId>jakarta.annotation-api</artifactId>
         </dependency>
 
-        <dependency>
-            <groupId>com.github.javaparser</groupId>
-            <artifactId>javaparser-core</artifactId>
-            <scope>test</scope>
-        </dependency>
     </dependencies>
 </project>
diff --git a/bigtop-manager-common/src/test/java/org/apache/bigtop/manager/common/utils/ChineseCharacterCheckTest.java b/bigtop-manager-common/src/test/java/org/apache/bigtop/manager/common/utils/ChineseCharacterCheckTest.java
deleted file mode 100644
index dc8636ed5..000000000
--- a/bigtop-manager-common/src/test/java/org/apache/bigtop/manager/common/utils/ChineseCharacterCheckTest.java
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *    https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.bigtop.manager.common.utils;
-
-import org.junit.jupiter.api.Assertions;
-import org.junit.jupiter.api.Test;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import com.github.javaparser.JavaParser;
-import com.github.javaparser.ParseResult;
-import com.github.javaparser.ast.CompilationUnit;
-import com.github.javaparser.ast.expr.StringLiteralExpr;
-
-import java.io.IOException;
-import java.nio.file.FileVisitOption;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-import java.util.regex.Pattern;
-import java.util.stream.Stream;
-
-/**
- * Test case for checking Chinese characters in Java files
- */
-public class ChineseCharacterCheckTest {
-
-    private static final Logger log = LoggerFactory.getLogger(ChineseCharacterCheckTest.class);
-
-    private static final Pattern CHINESE_CHAR_PATTERN = Pattern.compile("[\u4e00-\u9fa5]");
-    private static final Set<String> EXCLUDED_FILES = new HashSet<>(Collections.singletonList("Metrics"));
-    private static final String MAIN_SOURCE_DIR = "src/main/java";
-    private static final String TEST_SOURCE_DIR = "src/test/java";
-
-    private final JavaParser javaParser = new JavaParser();
-    private final String sourceDir;
-    private final String testDir;
-
-    public ChineseCharacterCheckTest() {
-        boolean isWindowsOs = System.getProperty("os.name").toLowerCase().startsWith("win");
-        String separator = isWindowsOs ? "\\" : "/";
-        this.sourceDir = MAIN_SOURCE_DIR.replace("/", separator);
-        this.testDir = TEST_SOURCE_DIR.replace("/", separator);
-    }
-
-    @Test
-    void shouldNotContainChineseInComments() {
-        List<String> violations = scanForChineseCharacters(ScanTarget.COMMENTS);
-        assertNoChineseCharacters(violations);
-    }
-
-    private List<String> scanForChineseCharacters(ScanTarget target) {
-        List<String> violations = new ArrayList<>();
-        try (Stream<Path> paths = Files.walk(Paths.get(".."), FileVisitOption.FOLLOW_LINKS)) {
-            paths.filter(this::isValidJavaFile).forEach(path -> processFile(path, target, violations));
-        } catch (IOException e) {
-            throw new RuntimeException("Failed to scan Java files", e);
-        }
-        return violations;
-    }
-
-    private boolean isValidJavaFile(Path path) {
-        String pathStr = path.toString();
-        return pathStr.endsWith(".java")
-                && (pathStr.contains(sourceDir) || pathStr.contains(testDir))
-                && EXCLUDED_FILES.stream().noneMatch(pathStr::contains);
-    }
-
-    private void processFile(Path path, ScanTarget target, List<String> violations) {
-        try {
-            ParseResult<CompilationUnit> parseResult = javaParser.parse(Files.newInputStream(path));
-            parseResult.getResult().ifPresent(cu -> {
-                if (target.includeComments()) {
-                    checkComments(cu, path, violations);
-                }
-                if (target.includeCode()) {
-                    checkCode(cu, path, violations);
-                }
-            });
-        } catch (Exception e) {
-            log.error("Error processing file: {}", path, e);
-        }
-    }
-
-    private void checkComments(CompilationUnit cu, Path path, List<String> violations) {
-        cu.getAllContainedComments().stream()
-                .filter(comment ->
-                        CHINESE_CHAR_PATTERN.matcher(comment.getContent()).find())
-                .forEach(comment -> violations.add(
-                        formatViolation(path, "comment", comment.getContent().trim())));
-    }
-
-    private void checkCode(CompilationUnit cu, Path path, List<String> violations) {
-        cu.findAll(StringLiteralExpr.class).stream()
-                .filter(str -> CHINESE_CHAR_PATTERN.matcher(str.getValue()).find())
-                .forEach(str -> violations.add(formatViolation(path, "code", str.getValue())));
-    }
-
-    private String formatViolation(Path path, String location, String content) {
-        return String.format("Chinese characters found in %s at %s: %s", location, path.toAbsolutePath(), content);
-    }
-
-    private void assertNoChineseCharacters(List<String> violations) {
-        Assertions.assertEquals(
-                0,
-                violations.size(),
-                () -> String.format(
-                        "Found Chinese characters in files:%n%s", String.join(System.lineSeparator(), violations)));
-    }
-
-    /**
-     * Defines what content should be checked for Chinese characters
-     */
-    private enum ScanTarget {
-        /**
-         * Check only comments
-         */
-        COMMENTS(true, false),
-        /**
-         * Check only code (string literals)
-         */
-        CODE(false, true),
-        /**
-         * Check both comments and code
-         */
-        ALL(true, true);
-
-        private final boolean checkComments;
-        private final boolean checkCode;
-
-        ScanTarget(boolean checkComments, boolean checkCode) {
-            this.checkComments = checkComments;
-            this.checkCode = checkCode;
-        }
-
-        public boolean includeComments() {
-            return checkComments;
-        }
-
-        public boolean includeCode() {
-            return checkCode;
-        }
-    }
-}