BIGTOP-4361: Add Chinese check

apache · Feb 15, 2025 · 454726d · 454726d
1 parent 774a0ea
commit 454726d
Show file tree

Hide file tree

Showing 5 changed files with 99 additions and 188 deletions.
diff --git a/.github/check_chinese_character.py b/.github/check_chinese_character.py
@@ -0,0 +1,95 @@
+import os
+import re
+from pathlib import Path
+from typing import List, Set
+
+class ChineseCharacterCheckTest:
+    """Regex-based scan for Chinese characters in source comments and string literals.
+
+    Files under ``SCAN_ROOT`` whose names end with one of ``SUPPORTED_EXTENSIONS``
+    are read as UTF-8 and checked; every hit is collected as a formatted message.
+    The comment and string detection is a heuristic, not a parser: a ``//`` inside
+    a string literal is treated as a comment start, and text matched by several
+    comment patterns is reported once per pattern.
+    """
+
+    CHINESE_CHAR_PATTERN = re.compile(r'[\u4e00-\u9fa5]')
+    # Names of directories or files to skip. An entry excludes a path only when it
+    # equals a whole path component, so "dist" does not exclude "distribution".
+    EXCLUDED_DIRS_AND_FILES = {
+        "target",
+        "node_modules",
+        "dist",
+    }
+    # Supported file extensions
+    SUPPORTED_EXTENSIONS = {".java", ".kt", ".scala", ".js", ".ts", ".vue"}
+    # Repository root: this script lives in <repo>/.github/, so go up two levels.
+    # Anchoring on the script location keeps the scan independent of the current
+    # working directory (CI invokes the script from the repository root).
+    SCAN_ROOT = Path(__file__).resolve().parent.parent
+
+    # Compiled once instead of on every check_comments() call.
+    _COMMENT_PATTERNS = tuple(
+        re.compile(pattern, re.DOTALL | re.MULTILINE)
+        for pattern in (
+            r'//.*?$',  # Single line comments
+            r'/\*.*?\*/',  # Multi line comments
+            r'<!--.*?-->',  # Vue/HTML template comments
+        )
+    )
+    # String literals; a backslash-escaped quote does not terminate the literal.
+    _STRING_PATTERNS = (
+        re.compile(r'"(?:\\[\s\S]|[^"\\])*"'),  # Double quoted strings
+        re.compile(r"'(?:\\[\s\S]|[^'\\])*'"),  # Single quoted strings
+    )
+
+    def should_not_contain_chinese_in_comments(self):
+        """Scan comments only and raise ``AssertionError`` if any Chinese text is found."""
+        violations = self.scan_for_chinese_characters(ScanTarget.COMMENTS)
+        self.assert_no_chinese_characters(violations)
+
+    def scan_for_chinese_characters(self, target: 'ScanTarget') -> List[str]:
+        """Walk ``SCAN_ROOT`` once and return the violation messages that were found.
+
+        :param target: selects whether comments, string literals, or both are checked.
+        :return: one formatted message per offending comment or string literal.
+        """
+        violations: List[str] = []
+        for dirpath, dirnames, filenames in os.walk(self.SCAN_ROOT):
+            # Prune excluded directories in place so os.walk never descends into
+            # them; sorting makes the report order deterministic.
+            dirnames[:] = sorted(
+                name for name in dirnames if name not in self.EXCLUDED_DIRS_AND_FILES
+            )
+            for filename in sorted(filenames):
+                path = Path(dirpath, filename)
+                if self.is_valid_file(path) and not self.is_excluded(path):
+                    self.process_file(path, target, violations)
+        return violations
+
+    def is_excluded(self, path: Path) -> bool:
+        """Return True when a component of ``path`` is listed in ``EXCLUDED_DIRS_AND_FILES``.
+
+        Components are taken relative to ``SCAN_ROOT`` when the path lies under it,
+        so the location of the checkout itself can never trigger an exclusion.
+        """
+        try:
+            parts = path.relative_to(self.SCAN_ROOT).parts
+        except ValueError:
+            # Not under SCAN_ROOT (e.g. a relative path): use it as given.
+            parts = path.parts
+        return not self.EXCLUDED_DIRS_AND_FILES.isdisjoint(parts)
+
+    def is_valid_file(self, path: Path) -> bool:
+        """Return True when the path string ends with a supported extension."""
+        return str(path).endswith(tuple(self.SUPPORTED_EXTENSIONS))
+
+    def process_file(self, path: Path, target: 'ScanTarget', violations: List[str]):
+        """Read one file as UTF-8 and append any violations it contains.
+
+        Files that cannot be read or decoded are reported on stdout and skipped;
+        any other exception propagates instead of being silently printed.
+        """
+        try:
+            with open(path, 'r', encoding='utf-8') as file:
+                content = file.read()
+        except (OSError, UnicodeDecodeError) as e:
+            print(f"Error processing file: {path}")
+            print(e)
+            return
+        if target.include_comments():
+            self.check_comments(content, path, violations)
+        if target.include_code():
+            self.check_code(content, path, violations)
+
+    def check_comments(self, content: str, path: Path, violations: List[str]):
+        """Append a violation for every comment in ``content`` containing Chinese text."""
+        for pattern in self._COMMENT_PATTERNS:
+            for comment in pattern.findall(content):
+                if self.CHINESE_CHAR_PATTERN.search(comment):
+                    violations.append(self.format_violation(path, "comment", comment.strip()))
+
+    def check_code(self, content: str, path: Path, violations: List[str]):
+        """Append a violation for every string literal in ``content`` containing Chinese text."""
+        for pattern in self._STRING_PATTERNS:
+            for string_literal in pattern.findall(content):
+                if self.CHINESE_CHAR_PATTERN.search(string_literal):
+                    violations.append(self.format_violation(path, "code", string_literal))
+
+    def format_violation(self, path: Path, location: str, content: str) -> str:
+        """Build the message for one hit; ``location`` is "comment" or "code"."""
+        return f"Chinese characters found in {location} at {path.absolute()}: {content}"
+
+    def assert_no_chinese_characters(self, violations: List[str]):
+        """Raise ``AssertionError`` listing all violations; do nothing when there are none.
+
+        Raised explicitly rather than with an ``assert`` statement, which would be
+        stripped by ``python -O`` and silently disable the whole check.
+        """
+        if violations:
+            raise AssertionError(
+                f"Found Chinese characters in files:\n{os.linesep.join(violations)}"
+            )
+
+class ScanTarget:
+    """Pair of switches choosing which parts of a file get scanned."""
+
+    def __init__(self, check_comments: bool, check_code: bool):
+        """Store whether comments and/or string literals should be inspected."""
+        self.check_comments, self.check_code = check_comments, check_code
+
+    def include_comments(self) -> bool:
+        """Return the current ``check_comments`` flag."""
+        return self.check_comments
+
+    def include_code(self) -> bool:
+        """Return the current ``check_code`` flag."""
+        return self.check_code
+
+# Ready-made presets, attached after the class body because they are
+# instances of the class itself.
+ScanTarget.COMMENTS = ScanTarget(True, False)
+ScanTarget.CODE = ScanTarget(False, True)
+ScanTarget.ALL = ScanTarget(True, True)
+
+if __name__ == "__main__":
+    # Script entry point used by CI: run the comment check over the repository.
+    test = ChineseCharacterCheckTest()
+    try:
+        test.should_not_contain_chinese_in_comments()
+    except AssertionError as err:
+        # Expected failure mode: print only the violation report to stderr and
+        # exit with status 1 instead of dumping a traceback into the CI log.
+        raise SystemExit(str(err)) from None
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -43,19 +43,11 @@ jobs:
         with:
           submodules: true
       - name: Set JDK
-        uses: actions/setup-java@v4
+        uses: actions/setup-python@v4
         with:
-          distribution: 'temurin'
-          java-version: '17'
-          cache: 'maven'
-      - name: Cache local Maven repository
-        uses: actions/cache@v4
-        with:
-          path: ~/.m2/repository
-          key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}
-          restore-keys: |
-            ${{ runner.os }}-maven-
-      - run: ./mvnw clean test -Dskip.pnpm -Dskip.installnodepnpm -Dskip.pnpm.test -DfailIfNoTests=false -Dtest=ChineseCharacterCheckTest
+          python-version: '3.13'
+          cache: 'pip' # caching pip dependencies
+      - run: python .github/check-chinese-character.py
 
   unit-tests-java:
     name: "Run unit test(Java)"

diff --git a/bigtop-manager-bom/pom.xml b/bigtop-manager-bom/pom.xml
@@ -53,7 +53,6 @@
         <langchain4j.version>0.35.0</langchain4j.version>
         <mybatis-spring-boot-starter.version>3.0.3</mybatis-spring-boot-starter.version>
         <pagehelper-spring-boot-starter.version>2.1.0</pagehelper-spring-boot-starter.version>
-        <javaparser.version>3.26.3</javaparser.version>
     </properties>
 
     <dependencyManagement>
@@ -279,11 +278,6 @@
                 <artifactId>langchain4j-reactor</artifactId>
                 <version>${langchain4j.version}</version>
             </dependency>
-            <dependency>
-                <groupId>com.github.javaparser</groupId>
-                <artifactId>javaparser-core</artifactId>
-                <version>${javaparser.version}</version>
-            </dependency>
         </dependencies>
     </dependencyManagement>
 </project>
diff --git a/bigtop-manager-common/pom.xml b/bigtop-manager-common/pom.xml
@@ -91,10 +91,5 @@
             <artifactId>jakarta.annotation-api</artifactId>
         </dependency>
 
-        <dependency>
-            <groupId>com.github.javaparser</groupId>
-            <artifactId>javaparser-core</artifactId>
-            <scope>test</scope>
-        </dependency>
     </dependencies>
 </project>
diff --git a/...ommon/src/test/java/org/apache/bigtop/manager/common/utils/ChineseCharacterCheckTest.java b/...ommon/src/test/java/org/apache/bigtop/manager/common/utils/ChineseCharacterCheckTest.java