From 454726de8ce8007c1acde5f37070e133a9e2101a Mon Sep 17 00:00:00 2001 From: wuchunfu <319355703@qq.com> Date: Sat, 15 Feb 2025 19:04:47 +0800 Subject: [PATCH] BIGTOP-4361: Add Chinese check --- .github/check_chinese_character.py | 95 ++++++++++ .github/workflows/ci.yml | 16 +- bigtop-manager-bom/pom.xml | 6 - bigtop-manager-common/pom.xml | 5 - .../utils/ChineseCharacterCheckTest.java | 165 ------------------ 5 files changed, 99 insertions(+), 188 deletions(-) create mode 100644 .github/check_chinese_character.py delete mode 100644 bigtop-manager-common/src/test/java/org/apache/bigtop/manager/common/utils/ChineseCharacterCheckTest.java diff --git a/.github/check_chinese_character.py b/.github/check_chinese_character.py new file mode 100644 index 000000000..51dc71af8 --- /dev/null +++ b/.github/check_chinese_character.py @@ -0,0 +1,95 @@ +import os +import re +from pathlib import Path +from typing import List, Set + +class ChineseCharacterCheckTest: + CHINESE_CHAR_PATTERN = re.compile(r'[\u4e00-\u9fa5]') + # Exclude directories or files. If it is a file, just write the file name. The same is true for directories, just write the directory name. 
+ EXCLUDED_DIRS_AND_FILES = { + "target", + "node_modules", + "dist", + } + # Supported file extensions + SUPPORTED_EXTENSIONS = {".java", ".kt", ".scala", ".js", ".ts", ".vue"} + + def should_not_contain_chinese_in_comments(self): + violations = self.scan_for_chinese_characters(ScanTarget.COMMENTS) + self.assert_no_chinese_characters(violations) + + def scan_for_chinese_characters(self, target: 'ScanTarget') -> List[str]: + violations = [] + for ext in self.SUPPORTED_EXTENSIONS: + for path in Path("..").rglob(f"*{ext}"): + if self.is_valid_file(path) and not self.is_excluded(path): + self.process_file(path, target, violations) + return violations + + def is_excluded(self, path: Path) -> bool: + path_str = str(path) + return any(excluded in path_str for excluded in self.EXCLUDED_DIRS_AND_FILES) + + def is_valid_file(self, path: Path) -> bool: + path_str = str(path) + return any(path_str.endswith(ext) for ext in self.SUPPORTED_EXTENSIONS) + + def process_file(self, path: Path, target: 'ScanTarget', violations: List[str]): + try: + with open(path, 'r', encoding='utf-8') as file: + content = file.read() + if target.include_comments(): + self.check_comments(content, path, violations) + if target.include_code(): + self.check_code(content, path, violations) + except Exception as e: + print(f"Error processing file: {path}") + print(e) + + def check_comments(self, content: str, path: Path, violations: List[str]): + # Matching multiple types of comments + comment_patterns = [ + r'//.*?$', # Single line comments + r'/\*.*?\*/', # Multi line comments + r'<!--.*?-->' # Vue/HTML,/javascript/typescript comments + ] + for pattern in comment_patterns: + for comment in re.findall(pattern, content, re.DOTALL | re.MULTILINE): + if self.CHINESE_CHAR_PATTERN.search(comment): + violations.append(self.format_violation(path, "comment", comment.strip())) + + def check_code(self, content: str, path: Path, violations: List[str]): + # Matching string literals in multiple languages + string_patterns = [ 
+ r'"[^"]*"', # Double quoted strings + r"'[^']*'" # Single quoted strings + ] + for pattern in string_patterns: + for string_literal in re.findall(pattern, content): + if self.CHINESE_CHAR_PATTERN.search(string_literal): + violations.append(self.format_violation(path, "code", string_literal)) + + def format_violation(self, path: Path, location: str, content: str) -> str: + return f"Chinese characters found in {location} at {path.absolute()}: {content}" + + def assert_no_chinese_characters(self, violations: List[str]): + assert len(violations) == 0, f"Found Chinese characters in files:\n{os.linesep.join(violations)}" + +class ScanTarget: + def __init__(self, check_comments: bool, check_code: bool): + self.check_comments = check_comments + self.check_code = check_code + + def include_comments(self) -> bool: + return self.check_comments + + def include_code(self) -> bool: + return self.check_code + +ScanTarget.COMMENTS = ScanTarget(True, False) +ScanTarget.CODE = ScanTarget(False, True) +ScanTarget.ALL = ScanTarget(True, True) + +if __name__ == "__main__": + test = ChineseCharacterCheckTest() + test.should_not_contain_chinese_in_comments() diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5174d8080..bd91fad8f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -43,19 +43,11 @@ jobs: with: submodules: true - name: Set JDK - uses: actions/setup-java@v4 + uses: actions/setup-python@v4 with: - distribution: 'temurin' - java-version: '17' - cache: 'maven' - - name: Cache local Maven repository - uses: actions/cache@v4 - with: - path: ~/.m2/repository - key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }} - restore-keys: | - ${{ runner.os }}-maven- - - run: ./mvnw clean test -Dskip.pnpm -Dskip.installnodepnpm -Dskip.pnpm.test -DfailIfNoTests=false -Dtest=ChineseCharacterCheckTest + python-version: '3.13' + cache: 'pip' # caching pip dependencies + - run: python .github/check_chinese_character.py unit-tests-java: name: "Run 
unit test(Java)" diff --git a/bigtop-manager-bom/pom.xml b/bigtop-manager-bom/pom.xml index 85efae8a2..ce8f88b89 100644 --- a/bigtop-manager-bom/pom.xml +++ b/bigtop-manager-bom/pom.xml @@ -53,7 +53,6 @@ 0.35.0 3.0.3 2.1.0 - 3.26.3 @@ -279,11 +278,6 @@ langchain4j-reactor ${langchain4j.version} - - com.github.javaparser - javaparser-core - ${javaparser.version} - diff --git a/bigtop-manager-common/pom.xml b/bigtop-manager-common/pom.xml index f93338cdf..ad71ab9d8 100644 --- a/bigtop-manager-common/pom.xml +++ b/bigtop-manager-common/pom.xml @@ -91,10 +91,5 @@ jakarta.annotation-api - - com.github.javaparser - javaparser-core - test - diff --git a/bigtop-manager-common/src/test/java/org/apache/bigtop/manager/common/utils/ChineseCharacterCheckTest.java b/bigtop-manager-common/src/test/java/org/apache/bigtop/manager/common/utils/ChineseCharacterCheckTest.java deleted file mode 100644 index dc8636ed5..000000000 --- a/bigtop-manager-common/src/test/java/org/apache/bigtop/manager/common/utils/ChineseCharacterCheckTest.java +++ /dev/null @@ -1,165 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.bigtop.manager.common.utils; - -import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.github.javaparser.JavaParser; -import com.github.javaparser.ParseResult; -import com.github.javaparser.ast.CompilationUnit; -import com.github.javaparser.ast.expr.StringLiteralExpr; - -import java.io.IOException; -import java.nio.file.FileVisitOption; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashSet; -import java.util.List; -import java.util.Set; -import java.util.regex.Pattern; -import java.util.stream.Stream; - -/** - * Test case for checking Chinese characters in Java files - */ -public class ChineseCharacterCheckTest { - - private static final Logger log = LoggerFactory.getLogger(ChineseCharacterCheckTest.class); - - private static final Pattern CHINESE_CHAR_PATTERN = Pattern.compile("[\u4e00-\u9fa5]"); - private static final Set EXCLUDED_FILES = new HashSet<>(Collections.singletonList("Metrics")); - private static final String MAIN_SOURCE_DIR = "src/main/java"; - private static final String TEST_SOURCE_DIR = "src/test/java"; - - private final JavaParser javaParser = new JavaParser(); - private final String sourceDir; - private final String testDir; - - public ChineseCharacterCheckTest() { - boolean isWindowsOs = System.getProperty("os.name").toLowerCase().startsWith("win"); - String separator = isWindowsOs ? 
"\\" : "/"; - this.sourceDir = MAIN_SOURCE_DIR.replace("/", separator); - this.testDir = TEST_SOURCE_DIR.replace("/", separator); - } - - @Test - void shouldNotContainChineseInComments() { - List violations = scanForChineseCharacters(ScanTarget.COMMENTS); - assertNoChineseCharacters(violations); - } - - private List scanForChineseCharacters(ScanTarget target) { - List violations = new ArrayList<>(); - try (Stream paths = Files.walk(Paths.get(".."), FileVisitOption.FOLLOW_LINKS)) { - paths.filter(this::isValidJavaFile).forEach(path -> processFile(path, target, violations)); - } catch (IOException e) { - throw new RuntimeException("Failed to scan Java files", e); - } - return violations; - } - - private boolean isValidJavaFile(Path path) { - String pathStr = path.toString(); - return pathStr.endsWith(".java") - && (pathStr.contains(sourceDir) || pathStr.contains(testDir)) - && EXCLUDED_FILES.stream().noneMatch(pathStr::contains); - } - - private void processFile(Path path, ScanTarget target, List violations) { - try { - ParseResult parseResult = javaParser.parse(Files.newInputStream(path)); - parseResult.getResult().ifPresent(cu -> { - if (target.includeComments()) { - checkComments(cu, path, violations); - } - if (target.includeCode()) { - checkCode(cu, path, violations); - } - }); - } catch (Exception e) { - log.error("Error processing file: {}", path, e); - } - } - - private void checkComments(CompilationUnit cu, Path path, List violations) { - cu.getAllContainedComments().stream() - .filter(comment -> - CHINESE_CHAR_PATTERN.matcher(comment.getContent()).find()) - .forEach(comment -> violations.add( - formatViolation(path, "comment", comment.getContent().trim()))); - } - - private void checkCode(CompilationUnit cu, Path path, List violations) { - cu.findAll(StringLiteralExpr.class).stream() - .filter(str -> CHINESE_CHAR_PATTERN.matcher(str.getValue()).find()) - .forEach(str -> violations.add(formatViolation(path, "code", str.getValue()))); - } - - private String 
formatViolation(Path path, String location, String content) { - return String.format("Chinese characters found in %s at %s: %s", location, path.toAbsolutePath(), content); - } - - private void assertNoChineseCharacters(List violations) { - Assertions.assertEquals( - 0, - violations.size(), - () -> String.format( - "Found Chinese characters in files:%n%s", String.join(System.lineSeparator(), violations))); - } - - /** - * Defines what content should be checked for Chinese characters - */ - private enum ScanTarget { - /** - * Check only comments - */ - COMMENTS(true, false), - /** - * Check only code (string literals) - */ - CODE(false, true), - /** - * Check both comments and code - */ - ALL(true, true); - - private final boolean checkComments; - private final boolean checkCode; - - ScanTarget(boolean checkComments, boolean checkCode) { - this.checkComments = checkComments; - this.checkCode = checkCode; - } - - public boolean includeComments() { - return checkComments; - } - - public boolean includeCode() { - return checkCode; - } - } -}