diff --git a/.github/scripts/check-ut.py b/.github/scripts/check-ut.py index 8cd490bc8..5778b5fd8 100644 --- a/.github/scripts/check-ut.py +++ b/.github/scripts/check-ut.py @@ -1,22 +1,47 @@ import argparse import sys import os +import re from junitparser import JUnitXml, Error, Failure, Skipped -parser = argparse.ArgumentParser() -parser.add_argument('junitxml', nargs='+') +parser = argparse.ArgumentParser(description='Test results analyzer') +parser.add_argument('input_files', nargs='+', help='JUnit XML files or log files') args = parser.parse_args() failures = [] -suites = [] +summaries = [] + +error_types = [ + "RuntimeError", + "ValueError", + "TypeError", + "AttributeError", + "KeyError", + "IndexError", + "ImportError", + "AssertionError", + "Exception", + "OSError", + "Failed", + "TimeoutError", + "asyncio.TimeoutError", + "FileNotFoundError", + "PermissionError", + "NotImplementedError", +] def get_classname(case): - return ' '.join(case.classname.split()) + return ' '.join(case.classname.split()) if hasattr(case, 'classname') else case.get('classname', '') def get_name(case): + if isinstance(case, dict): + return case.get('name', '') return ' '.join(case.name.split()) def get_result(case): + if isinstance(case, dict): + return case.get('status', 'failed') + result = "passed" if case.result: if isinstance(case.result[0], Error): @@ -28,88 +53,252 @@ def get_result(case): return result def get_message(case): + if isinstance(case, dict): + return case.get('error', '') + if not case.result: return "" - return f"{case.result[0].message.splitlines()[0]}" -def print_md_row(row, print_header): + full_text = case.result[0].text if hasattr(case.result[0], 'text') else case.result[0].message + if not full_text: + return "" + + error_messages = [] + capture_next_lines = False + indent_level = 0 + + for line in full_text.splitlines(): + stripped_line = line.strip() + if not stripped_line: + continue + + for error_type in error_types: + if stripped_line.startswith(error_type + ": "): + error_msg = stripped_line[len(error_type)+2:] + error_messages.append(f"{error_type}: {error_msg}") + capture_next_lines = True + indent_level = 0 + break + elif f"{error_type}:" in stripped_line and "Traceback" not in stripped_line: + error_msg = stripped_line.split(f'{error_type}:')[-1].strip() + error_messages.append(f"{error_type}: {error_msg}") + capture_next_lines = True + indent_level = 0 + break + + return " ; ".join(error_messages) if error_messages else f"{case.result[0].message.splitlines()[0]}" + + +def print_md_row(row, print_header=False, fail_list=None): if print_header: - header = " | ".join([f"{key}" for key, _ in row.items()]) + header = " | ".join([f"{key}" for key in row.keys()]) print(f"| {header} |") - header = " | ".join(["-"*len(key) for key, _ in row.items()]) + header = " | ".join(["---"] * len(row)) print(f"| {header} |") - row = " | ".join([f"{value}" for _, value in row.items()]) - print(f"| {row} |") + row_values = " | ".join([f"{value}" for value in row.values()]) + print(f"| {row_values} |") -def print_cases(cases): - print_header = True - for case in cases: - classname = get_classname(case) - name = get_name(case) - result = get_result(case) - message = get_message(case) - row = { - 'Class name': classname, - 'Test name': name, - 'Status': result, - 'Message': message, - } - print_md_row(row, print_header) - print_header = False + if fail_list != None: + fail_list.write(f"| {row_values} |\n") + + +def get_similar_issues(classname, name, result, message): + import requests + + 
os.environ["http_proxy"] = "" + os.environ["https_proxy"] = "" + DEFAULT_HOST_IP = "10.112.100.138" + + def QnA(request, host_ip=DEFAULT_HOST_IP): + import json + url = f"http://{host_ip}:8888/v1/chatqna" + + headers = {"Content-Type": "application/json"} + + response = requests.post(url, headers=headers, json=request) + return response + + prompt = f"unit test {name} {result} with {message}, is it a known issue? If yes, what is the issue id? And what is the owner and root cuase?" + + request = { + "messages": prompt, + "stream": False + } + + response = QnA (request) + if response.status_code==200: + result = response.json()["choices"][0]["message"]["content"] + answer = result.split("")[-1].strip() + answer = answer.split("**Answer:**")[-1].strip() + return answer + return "" + +def print_failures(): + if not failures: + return + + with open("ut_failure_list.csv", "w") as fail_list: + fail_list.write("sep=\'|\''.\n") + + print("### Test Failures") + print_header = True + for case in failures: + #issue = get_similar_issues(get_classname(case), get_name(case), get_result(case), get_message(case)) + print_md_row({ + 'Class name': get_classname(case), + 'Test name': get_name(case), + 'Status': get_result(case), + 'Message': get_message(case), + #'Similar issue': issue, + 'Source': case['source'] if isinstance(case, dict) else 'XML' + }, print_header, fail_list) -def print_suite(suite): + print_header = False + +def parse_log_file(log_file): + with open(log_file, encoding='utf-8') as f: + content = f.read() + + ut_name = os.path.splitext(os.path.basename(log_file))[0] + summary = { + 'Category': determine_category(ut_name), + 'UT': ut_name, + 'Test cases': 0, + 'Passed': 0, + 'Skipped': 0, + 'Failures': 0, + 'Errors': 0, + 'Source': 'Log' + } + + # Extract test counts + test_run_match = re.search(r"Ran (\d+) tests in [\d.]+s", content) + if test_run_match: + summary['Test cases'] = int(test_run_match.group(1)) + + # Extract skipped case number + skipped_match = re.search(r"skipped[ =](\d+)", content, re.IGNORECASE) + if skipped_match: + summary['Skipped'] = int(skipped_match.group(1)) + else: + skipped_match = re.search(r"skipped (\d+) cases?", content, re.IGNORECASE) + if skipped_match: + summary['Skipped'] = int(skipped_match.group(1)) + + # Extract failures + failure_blocks = re.findall(r"(FAIL:.*?)(?:\n\n|\n=+\n|\Z)", content, re.DOTALL) + exist_test_names = set() + failures_number = 0 + + for block in failure_blocks: + case_match = re.match(r"FAIL: (\w+) \(__mp_main__\.(\w+)\)", block) + if not case_match: + continue + + test_name = case_match.group(1) + if test_name in exist_test_names: + continue + exist_test_names.add(test_name) + + error_msg = [] + error_pattern = r"(" + "|".join(error_types) + r"):.*?(?=\n\S|\n\n|\n=+\n|\Z)" + error_matches = re.finditer(error_pattern, block, re.DOTALL) + if not error_matches and "Traceback" in block: + error_msg.append("Unknown error (see traceback)") + else: + for match in error_matches: + error_msg.append(match.group(0).strip()) + + failures.append({ + 'classname': ut_name, + 'name': f"{case_match.group(2)}:{test_name}", + 'error': " ".join(error_msg), + 'status': 'failed', + 'source': 'Log' + }) + failures_number += 1 + + if failures_number > summary['Failures']: + summary['Failures'] = failures_number + summary['Passed'] = summary['Test cases'] - summary['Failures'] - summary['Skipped'] + + return summary + +def determine_category(ut): + if ut == 'op_regression': + return 'op_regression' + elif ut == 'op_regression_dev1': + return 
'op_regression_dev1' + elif ut == 'op_extended': + return 'op_extended' + elif 'op_ut' in ut: + return 'op_ut' + else: + return 'unknown' + +def process_log_file(log_file): + try: + summary = parse_log_file(log_file) + summaries.append(summary) + except Exception as e: + print(f"Error processing {log_file}: {e}", file=sys.stderr) + +def process_xml_file(xml_file): + try: + xml = JUnitXml.fromfile(xml_file) + ut = os.path.basename(xml_file).split('.')[0] + category = determine_category(ut) + + for suite in xml: + suite_summary = { + 'Category': category, + 'UT': ut, + 'Test cases': suite.tests, + 'Passed': suite.tests - suite.skipped - suite.failures - suite.errors, + 'Skipped': suite.skipped, + 'Failures': suite.failures, + 'Errors': suite.errors, + 'Source': 'XML' + } + summaries.append(suite_summary) + + for case in suite: + if get_result(case) not in ["passed", "skipped"]: + failures.append(case) + except Exception as e: + print(f"Error processing {xml_file}: {e}", file=sys.stderr) + +def print_summary(): + print("### Results Summary") print_header = True - for suite in suites: - ut = args.junitxml[0] - del(args.junitxml[0]) - ut = os.path.basename(ut).split('.')[0] - tests = suite.tests - skipped = suite.skipped - failures = suite.failures - errors = suite.errors - if ut == 'op_regression': - category = 'op_regression' - elif ut == 'op_regression_dev1': - category = 'op_regression_dev1' - elif ut == 'op_extended': - category = 'op_extended' - elif 'op_ut' in ut: - category = 'op_ut' - row = { - 'Category': category, - 'UT': ut, - 'Test cases': tests, - 'Passed': tests-skipped-failures-errors, - 'Skipped': skipped, - 'Failures': failures, - 'Errors': errors, - } - print_md_row(row, print_header) + + + for summary in summaries: + print_md_row({ + 'Category': summary['Category'], + 'UT': summary['UT'], + 'Test cases': summary['Test cases'], + 'Passed': summary['Passed'], + 'Skipped': summary['Skipped'], + 'Failures': summary['Failures'], + 'Errors': summary['Errors'], + 'Source': summary['Source'] + }, print_header) + print_header = False -xmls = [ JUnitXml.fromfile(f) for f in args.junitxml ] -for idx, xml in enumerate(xmls): - for suite in xml: - suites.append(suite) - for case in suite: - classname = get_classname(case) - name = get_name(case) - result = get_result(case) - if result not in ["passed", "skipped"]: - failures.append(case) - -printed = False -def print_break(needed): - if needed: - print("") - -if failures: - print_break(printed) - print("### Failures") - print_cases(failures) - printed = True - -print("### Results Summary") -print_suite(suites) - -sys.exit(0) +def main(): + for input_file in args.input_files: + if input_file.endswith('.log'): + process_log_file(input_file) + elif input_file.endswith('.xml'): + process_xml_file(input_file) + else: + print(f"Skipping unknown file type: {input_file}", file=sys.stderr) + + print_failures() + print_summary() + + +if __name__ == "__main__": + main() diff --git a/.github/scripts/ut_result_check.sh b/.github/scripts/ut_result_check.sh index 3fb1a1997..9bf611786 100644 --- a/.github/scripts/ut_result_check.sh +++ b/.github/scripts/ut_result_check.sh @@ -72,14 +72,14 @@ if [[ "${ut_suite}" == 'torch_xpu' ]]; then echo -e "[PASS] UT ${ut_suite} test Pass" fi fi -if [[ "${ut_suite}" == 'xpu_distributed' ]]; then - grep -E "^FAILED|have failures" xpu_distributed_test.log | awk '{print $2}' > ./"${ut_suite}"_xpu_distributed_test_failed.log - num_failed_xpu_distributed=$(wc -l < "./${ut_suite}_xpu_distributed_test_failed.log") +if [[ 
"${ut_suite}" == 'xpu_distributed' || "${ut_suite}" == 'pytorch_distributed' ]]; then + grep -E "^FAILED|have failures" "${ut_suite}"_test.log | awk '{print $2}' > ./"${ut_suite}"_test_failed.log + num_failed_distributed=$(wc -l < "./${ut_suite}_test_failed.log") echo -e "=========================================================================" - echo -e "Show Failed cases in ${ut_suite} xpu distributed" + echo -e "Show Failed cases in ${ut_suite}" echo -e "=========================================================================" - cat "./${ut_suite}_xpu_distributed_test_failed.log" - ((num_failed=num_failed_xpu_distributed)) + cat "./${ut_suite}_test_failed.log" + ((num_failed=num_failed_distributed)) if [[ $num_failed -gt 0 ]]; then echo -e "[ERROR] UT ${ut_suite} test Fail" exit 1 diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index b67be9f29..ae6c2064c 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -65,7 +65,11 @@ jobs: source activate xpu_build cd ../ && rm -rf pytorch pip install requests - git clone https://github.com/pytorch/pytorch pytorch + if [[ ${{ inputs.pytorch }} == 'distributed_2.8' ]]; then + git clone https://github.com/daisyden/pytorch.git pytorch + else + git clone https://github.com/pytorch/pytorch pytorch + fi cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) # apply PRs for stock pytorch python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py @@ -167,11 +171,11 @@ jobs: if: ${{ ! cancelled() }} uses: actions/upload-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ env.TORCH_COMMIT_ID }} path: ${{ github.workspace }}/torch*.whl - name: Upload Build Log if: ${{ ! 
cancelled() }} uses: actions/upload-artifact@v4 with: - name: Torch-XPU-Build-Log-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Build-Log-${{ github.event.pull_request.number || github.sha }}-${{ env.TORCH_COMMIT_ID }} path: ${{ github.workspace }}/pytorch_*.log diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index b409d5774..afd8afdf0 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -44,7 +44,7 @@ permissions: read-all jobs: ut_test: runs-on: ${{ inputs.runner }} - if: ${{ inputs.ut != 'xpu_distributed' }} + if: ${{ inputs.ut != 'xpu_distributed' && inputs.ut != 'pytorch_distributed' }} timeout-minutes: 900 env: NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} @@ -95,7 +95,7 @@ jobs: if: ${{ inputs.pytorch != 'nightly_wheel' }} uses: actions/download-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.pytorch }} path: ${{ github.workspace }} - name: Install Pytorch XPU run: | @@ -182,6 +182,18 @@ jobs: cd ../pytorch/third_party/torch-xpu-ops/test/xpu timeout 10000 python run_test_with_skip.py 2>${{ github.workspace }}/ut_log/op_ut/op_ut_with_skip_test_error.log | tee ${{ github.workspace }}/ut_log/op_ut/op_ut_with_skip_test.log cp *.xml ${{ github.workspace }}/ut_log + find op_ut_with_skip_nn op_ut_with_skip_quantization/core -type f -exec sh -c ' + dir_path=$(dirname "$1"); + case "$dir_path" in + *"op_ut_with_skip_quantization/core"*) + dir_name="op_ut_with_skip_quantization_core";; + *) + dir_name=$(basename "$dir_path");; + esac; + mv "$1" "$dir_path/${dir_name}_$(basename "$1")" + ' _ {} \; + cp op_ut_with_skip_nn/*.xml ${{ github.workspace }}/ut_log + cp op_ut_with_skip_quantization/core/*.xml ${{ github.workspace }}/ut_log # Cases run with a on-demand white list, since some suites are too # slow to go through all operators on CPU. So add cases on-demand # when XPU implementatoin is done. @@ -254,6 +266,7 @@ jobs: source activate xpu_op_${ZE_AFFINITY_MASK} pip install junitparser python .github/scripts/check-ut.py ${{ github.workspace }}/ut_log/*.xml >> $GITHUB_STEP_SUMMARY || true + if [ -f "ut_failure_list.csv"];then cp ut_failure_list.csv ${{ github.workspace }}/ut_log/. 
fi - name: UT Test Results Check shell: bash run: | @@ -336,7 +349,7 @@ jobs: if: ${{ inputs.pytorch != 'nightly_wheel' }} uses: actions/download-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.pytorch }} path: ${{ github.workspace }} - name: Install Pytorch XPU run: | @@ -417,3 +430,284 @@ jobs: with: name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-xpu_distributed path: ${{ github.workspace }}/ut_log + + pytorch_distributed_test: + runs-on: ${{ inputs.runner }} + if: contains(inputs.ut, 'pytorch_distributed') + timeout-minutes: 900 + env: + NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} + DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} + steps: + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Prepare Stock Pytorch + run: | + pwd + which conda && conda clean -ay + conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \ + rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} + conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y + source activate xpu_op_${ZE_AFFINITY_MASK} + cd ../ && rm -rf pytorch + pip install requests + git clone https://github.com/daisyden/pytorch.git pytorch + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" + else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + fi + fi + - name: Triton Installation + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + cd ../pytorch + TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton" + if [ -z ${{ inputs.triton }} ]; then + TRITON_COMMIT_ID="$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" + else + TRITON_COMMIT_ID="${{ inputs.triton }}" + fi + echo ${TRITON_REPO}@${TRITON_COMMIT_ID} + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_COMMIT_ID}#subdirectory=python" + fi + - name: Download Pytorch wheel + if: ${{ inputs.pytorch != 'nightly_wheel' }} + uses: actions/download-artifact@v4 + with: + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.pytorch }} + path: ${{ github.workspace }} + - name: Install Pytorch XPU + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + source .github/scripts/env.sh ${{ inputs.pytorch }} + pip install mkl-static==2025.0.1 mkl-include==2025.0.1 + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd ../pytorch + export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"} + pip install -r requirements.txt + pip install --force-reinstall ${{ github.workspace }}/torch*.whl + git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd .. 
+ else + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu + TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') + cd ../pytorch + git reset --hard && git checkout ${TORCH_COMMIT_ID} + TORCH_XPU_OPS_COMMIT=$(${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log + cd ${{ github.workspace }} + sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope + - name: UT Test Results Check + shell: bash + run: | + function contains() { + contains_status="echo 'Start $2 ...'" + { + [[ $1 =~ (^|,)$2($|,) ]] + } || { + echo "[Warning] $2 is not suppotted type! Skipped!" + contains_status="continue" + } + } + set -xe + echo "UT_NAME=$(echo ${{ inputs.ut }} |sed 's/,/-/g')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" + cd ${{ github.workspace }}/ut_log/pytorch_distributed + cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./ + bash ut_result_check.sh 'pytorch_distributed' + - name: Upload Inductor XPU UT Log + if: ${{ ! cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-pytorch_distributed + path: ${{ github.workspace }}/ut_log + + pytorch_distributed_test: + runs-on: ${{ inputs.runner }} + if: contains(inputs.ut, 'pytorch_distributed') + timeout-minutes: 900 + env: + NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} + DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} + steps: + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Prepare Stock Pytorch + run: | + pwd + which conda && conda clean -ay + conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \ + rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} + conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y + source activate xpu_op_${ZE_AFFINITY_MASK} + cd ../ && rm -rf pytorch + pip install requests + git clone https://github.com/daisyden/pytorch.git pytorch + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" 
+ else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + fi + fi + - name: Triton Installation + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + cd ../pytorch + TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton" + if [ -z ${{ inputs.triton }} ]; then + TRITON_COMMIT_ID="$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" + else + TRITON_COMMIT_ID="${{ inputs.triton }}" + fi + echo ${TRITON_REPO}@${TRITON_COMMIT_ID} + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_COMMIT_ID}#subdirectory=python" + fi + - name: Download Pytorch wheel + if: ${{ inputs.pytorch != 'nightly_wheel' }} + uses: actions/download-artifact@v4 + with: + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }} + path: ${{ github.workspace }} + - name: Install Pytorch XPU + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + source .github/scripts/env.sh ${{ inputs.pytorch }} + pip install mkl-static==2025.0.1 mkl-include==2025.0.1 + if [[ ${{ inputs.abi }} == '0' ]]; then + export _GLIBCXX_USE_CXX11_ABI=0 + else + export _GLIBCXX_USE_CXX11_ABI=1 + fi + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd ../pytorch + export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"} + pip install -r requirements.txt + pip install --force-reinstall ${{ github.workspace }}/torch*.whl + git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd .. + else + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu + TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') + cd ../pytorch + git reset --hard && git checkout ${TORCH_COMMIT_ID} + TORCH_XPU_OPS_COMMIT=$(${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/pytorch_distributed/pytorch_distributed_test.log + cd ${{ github.workspace }} + sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope + - name: UT Test Results Check + shell: bash + run: | + function contains() { + contains_status="echo 'Start $2 ...'" + { + [[ $1 =~ (^|,)$2($|,) ]] + } || { + echo "[Warning] $2 is not suppotted type! Skipped!" + contains_status="continue" + } + } + set -xe + echo "UT_NAME=$(echo ${{ inputs.ut }} |sed 's/,/-/g')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" + cd ${{ github.workspace }}/ut_log/pytorch_distributed + cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./ + bash ut_result_check.sh 'pytorch_distributed' + - name: Upload Inductor XPU UT Log + if: ${{ ! cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}-pytorch_distributed + path: ${{ github.workspace }}/ut_log + - name: Upload XPU UT Log + if: ${{ ! 
cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: XPU-UT-Failure-List-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}-pytorch_distributed + path: ${{ github.workspace }}/ut_log/ut_failure_list.csv diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 3dd204e32..0e9ee9f63 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -66,6 +66,31 @@ jobs: pytorch: ${{ needs.preci-linux-build.outputs.torch_commit_id }} ut: op_regression,op_regression_dev1,op_extended,op_ut,xpu_distributed runner: linux.idc.xpu + + preci-linux-build-distributed: + # Don't run on forked repos and draft PRs + secrets: inherit + if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }} + name: preci-linux-distributed + needs: preci-lint-check + permissions: + issues: write + uses: ./.github/workflows/_linux_build.yml + with: + pytorch: distributed_2.8 + runner: pvc_e2e + + preci-ut-distributed: + # Don't run on forked repos and draft PRs + secrets: inherit + if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }} + name: preci-linux-distributed + needs: preci-linux-build-distributed + uses: ./.github/workflows/_linux_ut.yml + with: + pytorch: ${{ needs.preci-linux-build-distributed.outputs.torch_commit_id }} + ut: pytorch_distributed + runner: pvc_e2e Inductor-XPU-E2E-CI-Tests: name: preci-linux / e2e_test @@ -112,7 +137,7 @@ jobs: if: ${{ inputs.pytorch }} != 'nightly_wheel' uses: actions/download-artifact@v4 with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }} + name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ needs.preci-linux-build.outputs.torch_commit_id }} path: ${{ github.workspace }} - name: Install Pytorch XPU run: | diff --git a/test/xpu/run_distributed_local.py b/test/xpu/run_distributed_local.py new file mode 100644 index 000000000..cc40373bc --- /dev/null +++ b/test/xpu/run_distributed_local.py @@ -0,0 +1,125 @@ +import os +import subprocess +import sys + +from skip_list_dist_local import skip_dict, skip_dict_python + +res = 0 +res2 = 0 +fail_test = [] +error_log = "" + +os.environ["PYTHONPATH"] = "$PYTHONPATH:../../../../test/distributed/pipelining" +# Get the xelink group card affinity +ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") +if ret == 0: + gpu_dict = {} + with open("topology.log") as file: + lines = file.readlines() + for line in lines: + if "CPU Affinity" in line: + continue + line = line.strip() + if line.startswith("GPU "): + items = line.split(" ") + items = [x for x in items if x] + gpu_id = items[1] + i = gpu_id.split("/")[0] + affinity = "" + for j, item in enumerate(items): + if "SYS" not in item and ("XL" in item or "S" in item): + if len(affinity) == 0: + affinity = str(j - 2) + else: + affinity = affinity + "," + str(j - 2) + gpu_dict[i] = affinity + + max_affinity = "" + for key, value in gpu_dict.items(): + if len(value) > len(max_affinity): + max_affinity = value + + os.environ["ZE_AFFINITY_MASK"] = str(max_affinity) + print(str("ZE_AFFINITY_MASK=" + os.environ.get("ZE_AFFINITY_MASK"))) + +else: + print("xpu-smi topology failed") + sys.exit(255) + + +from xpu_test_utils import launch_test + +# run python test +def run(test_command): + result = subprocess.run(test_command, capture_output=True, text=True) + print(result.stdout) + print(result.stderr) + return result + + +for key in skip_dict_python: + skip_list = skip_dict_python[key] if 
skip_dict_python[key] else [] + test_command = ["python", key] + fail = run(test_command) + num_skipped = 0 + num_err = 0 + if fail.returncode: + for i, err in enumerate(fail.stderr.split("FAIL: ")): + if i == 0 and len(err) > 0: + error_log += err + continue + is_skipped = False + for skip_case in skip_list: + if skip_case in err: + print("Skipped error: ", key + " " + skip_case) + num_skipped += 1 + is_skipped = True + break + if not is_skipped: + num_err += 1 + res2 += fail.returncode + if i == len(fail.stderr.split("FAIL: ")) - 1: + error_log += "FAIL: " + for line in err.split("\n"): + if line.startswith("FAILED (failures="): + num_errs = line.split("=")[1].split(")")[0].strip() + error_log += ("FAILED (failures=" + str(int(num_errs) - num_skipped) + f" skipped {num_skipped} cases" + ")\n") + else: + error_log += (line + "\n") + else: + error_log += ("FAIL: " + err) + else: + if i == len(fail.stderr.split("FAIL: ")) - 1: + error_log += "FAIL: " + for line in err.split("\n"): + if line.startswith("FAILED (failures="): + num_errs = line.split("=")[1].split(")")[0].strip() + error_log += ("FAILED (failures=" + str(int(num_errs) - num_skipped) + f" skipped {num_skipped} cases" + ")\n") + + renamed_key = key.replace("../../../../", "").replace("/", "_") + if num_err > 0: + fail_test.append(key) + with open(f"op_ut_with_skip_{renamed_key}.log", "w") as f: + f.write(error_log) + else: + import pdb;pdb.set_trace() + with open(f"op_ut_with_skip_{renamed_key}.log", "w") as f: + f.write(fail.stdout) + f.write(fail.stderr) + +# run pytest with skiplist +for key in skip_dict: + skip_list = skip_dict[key] + fail = launch_test(key, skip_list) + res += fail + if fail: + fail_test.append(key) + +if fail_test: + print(",".join(fail_test) + " have failures") + +exit_code = os.WEXITSTATUS(res) +if exit_code == 0: + sys.exit(res2) +else: + sys.exit(exit_code) diff --git a/test/xpu/skip_list_dist_local.py b/test/xpu/skip_list_dist_local.py new file mode 100644 index 000000000..9ec4c59e0 --- /dev/null +++ b/test/xpu/skip_list_dist_local.py @@ -0,0 +1,465 @@ +skip_dict = { + "../../../../test/distributed/fsdp/test_checkpoint_wrapper.py": None, + # https://github.com/intel/torch-xpu-ops/issues/1536 + # "../../../../test/distributed/fsdp/test_distributed_checkpoint.py": ( + # "test_distributed_checkpoint_state_dict_type0_xpu", + # "test_distributed_checkpoint_state_dict_type1_xpu", + # ), + "../../../../test/distributed/fsdp/test_fsdp_apply.py": None, + "../../../../test/distributed/fsdp/test_fsdp_backward_prefetch.py": None, + "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_basic_checkpoint_end_to_end_cpu_offload1_offload_activations_False_use_orig_params_False", + "test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_False_use_orig_params_False", + "test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_True_use_orig_params_False", + "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_False_use_orig_params_False", + "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_True_use_orig_params_False", + "test_checkpoint_submodule_use_reentrant_False_xpu", + ), + "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_ddp_parity_xpu", + ), + "../../../../test/distributed/fsdp/test_fsdp_comm.py": None, + "../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + 
"test_bf16_hook_has_wrapping_False_sharding_strategy0", + "test_bf16_hook_has_wrapping_False_sharding_strategy1", + "test_bf16_hook_has_wrapping_False_sharding_strategy2", + "test_bf16_hook_has_wrapping_True_sharding_strategy0", + "test_bf16_hook_has_wrapping_True_sharding_strategy1", + "test_bf16_hook_has_wrapping_True_sharding_strategy2", + "test_fp16_hook_has_wrapping_False_sharding_strategy1", + "test_fp16_hook_has_wrapping_False_sharding_strategy2", + "test_fp16_hook_has_wrapping_True_sharding_strategy0", + "test_fp16_hook_has_wrapping_True_sharding_strategy1", + "test_fp16_hook_has_wrapping_True_sharding_strategy2", + ), + "../../../../test/distributed/fsdp/test_fsdp_core.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_delayed_optim_step_offload_true_no_shard_xpu", + "test_transformer_no_grad_mixed_precision_True_xpu", + "test_delayed_optim_step_offload_false_no_shard_xpu", + "test_delayed_optim_step_offload_false_none_xpu", + "test_delayed_optim_step_offload_false_shard_grad_op_xpu", + "test_delayed_optim_step_offload_true_none_xpu", + "test_delayed_optim_step_offload_true_shard_grad_op_xpu", + "test_delayed_reduce_scatter_offload_false_no_shard_xpu", + "test_delayed_reduce_scatter_offload_false_none_xpu", + "test_delayed_reduce_scatter_offload_false_shard_grad_op_xpu", + "test_delayed_reduce_scatter_offload_true_none_xpu", + "test_delayed_reduce_scatter_offload_true_shard_grad_op_xpu", + "test_mixture_of_experts_offload_false_no_shard_xpu", + "test_mixture_of_experts_offload_false_none_xpu", + "test_mixture_of_experts_offload_false_shard_grad_op_xpu", + "test_mixture_of_experts_offload_true_none_xpu", + "test_mixture_of_experts_offload_true_shard_grad_op_xpu", + "test_mixture_of_experts_with_delay_before_free_offload_false_no_shard_xpu", + "test_mixture_of_experts_with_delay_before_free_offload_false_none_xpu", + "test_mixture_of_experts_with_delay_before_free_offload_false_shard_grad_op_xpu", + "test_mixture_of_experts_with_delay_before_free_offload_true_none_xpu", + "test_mixture_of_experts_with_delay_before_free_offload_true_shard_grad_op_xpu", + "test_nested_always_wrap_model_offload_false_no_shard_xpu", + "test_nested_always_wrap_model_offload_false_none_xpu", + "test_nested_always_wrap_model_offload_false_shard_grad_op_xpu", + "test_nested_always_wrap_model_offload_true_none_xpu", + "test_nested_always_wrap_model_offload_true_shard_grad_op_xpu", + "test_nested_wrapped_model_offload_false_no_shard_xpu", + "test_nested_wrapped_model_offload_false_none_xpu", + "test_nested_wrapped_model_offload_false_shard_grad_op_xpu", + "test_nested_wrapped_model_offload_true_none_xpu", + "test_nested_wrapped_model_offload_true_shard_grad_op_xpu", + "test_transformer_offload_false_none_xpu", + "test_transformer_offload_false_shard_grad_op_xpu", + "test_transformer_offload_true_none_xpu", + "test_transformer_offload_true_shard_grad_op_xpu", + ), + "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + " test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_is_even_sharded_model_False_xpu", + ), + "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None, + "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_hooks_multi_traversal_xpu", + "test_parity_with_ddp_xpu", + "test_parity_with_non_frozen_fsdp_xpu", + ), + "../../../../test/distributed/fsdp/test_fsdp_flatten_params.py": None, + 
"../../../../test/distributed/fsdp/test_fsdp_freezing_weights.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_False_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_True", + 
"test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_GradToNone_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_False_disable_autograd_True_forward_prefetch_True ", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_False_forward_prefetch_True", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_False", + "test_freezing_weights_with_nested_trunk_True_freezing_method_FreezingMethod_RequiresGrad_freeze_after_wrap_fsdp_True_disable_autograd_True_forward_prefetch_True", + ), + "../../../../test/distributed/fsdp/test_fsdp_fx.py": None, + "../../../../test/distributed/fsdp/test_fsdp_grad_acc.py": None, + "../../../../test/distributed/fsdp/test_fsdp_hybrid_shard.py": None, + "../../../../test/distributed/fsdp/test_fsdp_ignored_modules.py": None, + "../../../../test/distributed/fsdp/test_fsdp_input.py": None, + "../../../../test/distributed/fsdp/test_fsdp_memory.py": None, + "../../../../test/distributed/fsdp/test_fsdp_meta.py": None, + "../../../../test/distributed/fsdp/test_fsdp_misc.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1535 + "test_fsdp_zero2_eval_with_prefetch", + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_fsdp_optimizer_overlap", + ), + "../../../../test/distributed/fsdp/test_fsdp_mixed_precision.py": None, + "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_multi_forward_cpu", + ), + "../../../../test/distributed/fsdp/test_fsdp_multiple_wrapping.py": None, + # https://github.com/intel/torch-xpu-ops/issues/1537 + "../../../../test/distributed/fsdp/test_fsdp_optim_state.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_flatten_sharded_optim_state_dict_nested", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_False_rank0_only_False_use_diff_optim_inputs_False", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_False_rank0_only_False_use_diff_optim_inputs_True", + 
"test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_False_rank0_only_True_use_diff_optim_inputs_False", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_False_rank0_only_True_use_diff_optim_inputs_True", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_True_rank0_only_False_use_diff_optim_inputs_False", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_True_rank0_only_False_use_diff_optim_inputs_True", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_True_rank0_only_True_use_diff_optim_inputs_False", + "test_optim_state_dict_nested_state_dict_type0_use_multiple_param_groups_True_rank0_only_True_use_diff_optim_inputs_True", + "test_optim_state_dict_nested_state_dict_type1_use_multiple_param_groups_False_rank0_only_False_use_diff_optim_inputs_False", + "test_optim_state_dict_nested_state_dict_type1_use_multiple_param_groups_False_rank0_only_False_use_diff_optim_inputs_True", + "test_optim_state_dict_nested_state_dict_type1_use_multiple_param_groups_True_rank0_only_False_use_diff_optim_inputs_False", + "test_optim_state_dict_nested_state_dict_type1_use_multiple_param_groups_True_rank0_only_False_use_diff_optim_inputs_True", + "test_rekey_optim_state_dict_to_ids_state_dict_type0_use_multiple_param_groups_False", + "test_rekey_optim_state_dict_to_ids_state_dict_type0_use_multiple_param_groups_True", + "test_rekey_optim_state_dict_to_ids_state_dict_type1_use_multiple_param_groups_False", + "test_rekey_optim_state_dict_to_ids_state_dict_type1_use_multiple_param_groups_True", + "test_rekey_optim_state_dict_to_names", + "test_scatter_full_optim_state_dict_nested_halve_world_size", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_False_use_diff_optim_inputs_False", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_False_use_diff_optim_inputs_True", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_True_use_diff_optim_inputs_False", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_True_use_diff_optim_inputs_True", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_False_use_diff_optim_inputs_False", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_False_use_diff_optim_inputs_True", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_True_use_diff_optim_inputs_False", + "test_scatter_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_True_use_diff_optim_inputs_True", + "test_shard_full_optim_state_dict_nested_halve_world_size", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_False_use_diff_optim_inputs_False", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_False_use_diff_optim_inputs_True", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_True_use_diff_optim_inputs_False", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_False_wrap_alt_True_use_diff_optim_inputs_True", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_False_use_diff_optim_inputs_False", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_False_use_diff_optim_inputs_True", + 
"test_shard_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_True_use_diff_optim_inputs_False", + "test_shard_full_optim_state_dict_nested_use_multiple_param_groups_True_wrap_alt_True_use_diff_optim_inputs_True", + "test_use_orig_params", + ), + # Performance check, skip + # "../../../../test/distributed/fsdp/test_fsdp_overlap.py": ( + # # https://github.com/intel/torch-xpu-ops/issues/1504 + # "test_forward_overlap", + # "test_forward_overlap_xpu", + # ), + "../../../../test/distributed/fsdp/test_fsdp_pure_fp16.py": None, + "../../../../test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_fsdp_ddp_parity_with_grad_scaler_offload_false_none_none_none", + "test_fsdp_ddp_parity_with_grad_scaler_offload_false_shard_grad_op_none_none", + "test_fsdp_ddp_parity_with_grad_scaler_offload_true_none_none_none", + "test_fsdp_ddp_parity_with_grad_scaler_offload_true_shard_grad_op_none_none", + ), + "../../../../test/distributed/fsdp/test_fsdp_state_dict.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_state_dict_save_load_flow_state_dict_type_local_state_dict", + "test_state_dict_save_load_flow_state_dict_type_sharded_state_dict", + "test_state_dict_save_load_flow_state_dict_type_state_dict", + ), + "../../../../test/distributed/fsdp/test_fsdp_tp_integration.py": None, + "../../../../test/distributed/fsdp/test_fsdp_traversal.py": None, + "../../../../test/distributed/fsdp/test_fsdp_uneven.py": None, + "../../../../test/distributed/fsdp/test_fsdp_unshard_params.py": None, + "../../../../test/distributed/fsdp/test_fsdp_use_orig_params.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_diff_hyperparams_sharding_strategy_str_full_shard", + "test_diff_hyperparams_sharding_strategy_str_no_shard", + "test_diff_hyperparams_sharding_strategy_str_shard_grad_op", + "test_no_sync_correctness", + ), + "../../../../test/distributed/fsdp/test_hsdp_dtensor_state_dict.py": None, + "../../../../test/distributed/fsdp/test_shard_utils.py": None, + "../../../../test/distributed/fsdp/test_utils.py": None, + "../../../../test/distributed/fsdp/test_wrap.py": None, + "../../../../test/distributed/test_backends.py": None, + "../../../../test/distributed/test_c10d_common.py": None, + "../../../../test/distributed/test_c10d_functional_native.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1508 + # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path + "test_reduce_scatter_tensor_coalesced", + "test_reduce_scatter_tensor_single", + # https://github.com/intel/torch-xpu-ops/issues/1525 + # ValueError: trying to initialize the default process group twice! 
+ "test_inductor_all_gather_into_tensor_coalesced", + "test_inductor_all_gather_into_tensor_single", + "test_inductor_all_reduce_coalesced", + "test_inductor_all_reduce_non_contig_input", + "test_inductor_all_reduce_single", + "test_inductor_all_to_all_single", + "test_inductor_broadcast", + "test_inductor_inplace_op_on_view", + "test_inductor_reduce_scatter_tensor_coalesced", + "test_inductor_reduce_scatter_tensor_single", + "test_inductor_reuse_buffer_after_inplace_collective", + "test_ranks_and_tag", + "test_wait_tensor", + ), + "../../../../test/distributed/test_c10d_logger.py": None, + "../../../../test/distributed/test_c10d_object_collectives.py": ( + # RuntimeError: Process 0 terminated or timed out after 300.09047198295593 seconds + # https://github.com/intel/torch-xpu-ops/issues/1535 + "test_gather_object_cpu", + "test_gather_object_xpu", + "test_gather_object_list_cpu", + "test_gather_object_list_xpu", + ), + "../../../../test/distributed/test_compute_comm_reordering.py": None, + "../../../../test/distributed/test_control_collectives.py": None, + "../../../../test/distributed/test_device_mesh.py": None, + "../../../../test/distributed/test_dynamo_distributed.py": ( + # AttributeError:'torch._C._distributed_c10d.ProcessGroupXCCL' object has no attribute '_set_default_timeout' + "test_asymmetric_compilation", + "test_asymmetric_compilation_with_fx_cache", + # ValueError: FlexAttention is only supported on CUDA or CPU devices. Found input tensors on xpu device. + "test_compiled_flex_attention_full_model_ddp", + "test_compiled_flex_attention_local_ddp", + # torch._dynamo.exc.InternalTorchDynamoError: AttributeError: __enter__ + # https://github.com/intel/torch-xpu-ops/issues/1527 + "test_compiler_collectives_automatic_dynamic_scalar", + "test_compiler_collectives_automatic_dynamic_speculation_divergence", + "test_compiler_collectives_automatic_dynamic_tensor", + "test_compiler_collectives_dim_mismatch", + "test_compiler_collectives_graph_break_empty_graph_still_collective", + "test_compiler_collectives_missing_source", + "test_compiler_collectives_scalar_missing_source", + "test_compiler_collectives_type_mismatch", + "test_ddp_activation_checkpointing", + "test_ddp_baseline_aot_eager_multiprocess", + "test_fsdp_activation_checkpointing", + "test_fsdp_aot_eager", + "test_fsdp_inductor", + "test_fsdp_setattr", + "test_fsdp_unspecialized_forced_getattr_inline", + "test_fsdp_unspecialized_forced_getattr_no_inline", + # RuntimeError: UR backend failed. UR backend returns:40 (UR_RESULT_ERROR_OUT_OF_RESOURCES) + # https://github.com/intel/torch-xpu-ops/issues/1526 + "test_get_pg_attr", + ), + "../../../../test/distributed/test_fake_pg.py": None, + "../../../../test/distributed/test_functional_api.py": ( + # RuntimeError: UR backend failed. UR backend returns:40 (UR_RESULT_ERROR_OUT_OF_RESOURCES) + # https://github.com/intel/torch-xpu-ops/issues/1526 + "test_tracing_xpu", + "test_tracing and test_tracing_with_fakepg and test_tracing_with_fakepg_xpu and test_tracing_with_dce_code and test_tracing_with_dce_code_xpu", + ), + "../../../../test/distributed/test_multi_threaded_pg.py": ( + # oneccl not support multi-threaded well, so skip it first. 
+ # https://github.com/intel/torch-xpu-ops/issues/1509 + "test_bwd_sees_fwd_pg", + ), + "../../../../test/distributed/test_store.py": None, + "../../../../test/distributed/pipelining/test_backward.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_stage_backward_weight_multiple_iters_xpu", + "test_stage_backward_weight_xpu", + "test_stage_backward_xpu", + ), + "../../../../test/distributed/pipelining/test_microbatch.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_chunk_spec_xpu", + ), + "../../../../test/distributed/pipelining/test_pipe.py": None, + "../../../../test/distributed/pipelining/test_schedule.py": None, + "../../../../test/distributed/pipelining/test_transformer.py": None, + "../../../../test/distributed/pipelining/test_unflatten.py": None, + "../../../../test/distributed/tensor/parallel/test_micro_pipeline_tp.py": ( + # NotImplementedError: The operator 'symm_mem::fused_matmul_reduce_scatter' + # is not currently implemented for the XPU device + # https://github.com/intel/torch-xpu-ops/issues/1547 + "test_dtensor_seq_par_shard_dim_0", + "test_dtensor_seq_par_shard_dim_1", + "test_fuse_matmul_reduce_scatter_A_dims_2_scatter_dim_0", + "test_fuse_matmul_reduce_scatter_A_dims_2_scatter_dim_1", + "test_fuse_matmul_reduce_scatter_A_dims_3_scatter_dim_0", + "test_fuse_matmul_reduce_scatter_A_dims_3_scatter_dim_1", + "test_fuse_matmul_reduce_scatter_A_dims_3_scatter_dim_2", + # AssertionError: 'fused_all_gather_matmul' not found in '# AOT ID: ......' + # https://github.com/intel/torch-xpu-ops/issues/1548 + "test_fuse_all_gather_matmul_A_dims_2_gather_dim_0_return_A_False", + "test_fuse_all_gather_matmul_A_dims_2_gather_dim_0_return_A_True", + "test_fuse_all_gather_matmul_A_dims_3_gather_dim_0_return_A_False", + "test_fuse_all_gather_matmul_A_dims_3_gather_dim_0_return_A_True", + "test_fuse_all_gather_matmul_A_dims_3_gather_dim_1_return_A_False", + "test_fuse_all_gather_matmul_A_dims_3_gather_dim_1_return_A_True", + # AssertionError: 'fused_all_gather_scaled_matmul' not found in 'graph():\n......' + # https://github.com/intel/torch-xpu-ops/issues/1549 + "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_0_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_0_return_A_True", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_0_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_0_return_A_True", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_1_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_1_return_A_True", + # NotImplementedError: The operator 'aten::_scaled_mm.out' is not currently implemented for the XPU device. + # https://github.com/intel/torch-xpu-ops/issues/1550 + "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_1_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_2_gather_dim_1_return_A_True", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_2_return_A_False", + "test_fuse_all_gather_scaled_matmul_A_dims_3_gather_dim_2_return_A_True", + # NotImplementedError: The operator 'symm_mem::fused_scaled_matmul_reduce_scatter' + # is not currently implemented for the XPU device. 
+ # https://github.com/intel/torch-xpu-ops/issues/1551 + "test_fuse_scaled_matmul_reduce_scatter_A_dims_2_scatter_dim_0", + "test_fuse_scaled_matmul_reduce_scatter_A_dims_2_scatter_dim_1", + "test_fuse_scaled_matmul_reduce_scatter_A_dims_3_scatter_dim_0", + "test_fuse_scaled_matmul_reduce_scatter_A_dims_3_scatter_dim_1", + "test_fuse_scaled_matmul_reduce_scatter_A_dims_3_scatter_dim_2", + "test_fuse_scaled_matmul_reduce_scatter_rowwise_scales_reshape_mm_reshape_scatter_dim_0", + "test_fuse_scaled_matmul_reduce_scatter_rowwise_scales_reshape_mm_reshape_scatter_dim_1", + "test_fuse_scaled_matmul_reduce_scatter_rowwise_scales_reshape_mm_reshape_scatter_dim_2", + ), + "../../../../test/distributed/tensor/parallel/test_tp_examples.py": ( + # RuntimeError: aten.add.Tensor: got mixed torch.Tensor and DTensor, need to convert all torch.Tensor to DTensor before calling distributed operators! + # https://github.com/intel/torch-xpu-ops/issues/1555 + "test_transformer_req_grad_seq_parallel_float32_thaw_all", + "test_transformer_req_grad_seq_parallel_float32_thaw_layers_0_attention_wv__layers_0_feed_forward_w1__layers_1_feed_forward_w2__layers_1_ffn_norm__output__tok_embeddings", + "test_transformer_req_grad_seq_parallel_float32_thaw_layers_1_ffn_norm__norm__output__tok_embeddings", + "test_transformer_req_grad_seq_parallel_float32_thaw_norm__output__tok_embeddings", + "test_transformer_req_grad_seq_parallel_float32_thaw_output__tok_embeddings", + "test_transformer_training_is_seq_parallel_False_float32", + "test_transformer_training_is_seq_parallel_True_float32", + # NotImplementedError: Operator aten._scaled_dot_product_fused_attention_overrideable.default does not have a sharding strategy registered. + # https://github.com/intel/torch-xpu-ops/issues/1556 + "test_transformer_req_grad_seq_parallel_float32_thaw_norm__output", + ), + "../../../../test/distributed/tensor/parallel/test_tp_random_state.py": None, + "../../../../test/distributed/tensor/parallel/test_parallelize_api.py": None, + "../../../../test/distributed/tensor/parallel/test_tp_style.py": None, + "../../../../test/distributed/tensor/test_api.py": None, + "../../../../test/distributed/tensor/test_attention.py": None, + "../../../../test/distributed/tensor/test_common_rules.py": None, + "../../../../test/distributed/tensor/test_dtensor.py": ( + # Passed with updated test code for world_size 8 + "test_auto_implicit_replication", + "test_default_value_sub_mesh", + "test_device_mesh_nd", + "test_dtensor_2d_mesh", + "test_dtensor_api_device_mesh_context_manager", + "test_dtensor_device_mesh_device_conversion", + "test_dtensor_spec_local_shard_offset", + "test_from_local_sub_mesh", + "test_implicit_replication", + "test_metadata_consistency_check", + "test_redistribute_sub_mesh", + "test_split_tensor_1D", + ), + "../../../../test/distributed/tensor/test_dtensor_compile.py": ( + # https://jira.devtools.intel.com/browse/MLSL-3625 + "test_2d_fsdp_tp_compile", + ), + "../../../../test/distributed/tensor/test_experimental_ops.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1535 + "test_bernoulli", + ), + "../../../../test/distributed/tensor/test_init.py": None, + "../../../../test/distributed/tensor/test_math_ops.py": ( + # RuntimeError: oneCCL: coll_param.cpp:455 validate: EXCEPTION: average operation is not supported for the scheduler path + # https://github.com/intel/torch-xpu-ops/issues/1508 + "test_mean", + "test_nll_loss_and_cross_entropy", + ), + "../../../../test/distributed/tensor/test_random_ops.py": ( + # Need to update world 
size + "test_hsdp_tp_model_meta_init", + ), + "../../../../test/distributed/tensor/test_redistribute.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_redistribute_shard_dim_multi_dim_mesh", + ), + "../../../../test/distributed/tensor/test_tensor_ops.py": None, + "../../../../test/distributed/tensor/experimental/test_register_sharding.py": None, + "../../../../test/distributed/_shard/test_sharder.py": None, + # FSDP2 + "../../../../test/distributed/_composable/fsdp/test_fully_shard_autograd.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_clip_grad_norm_2d", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_comm.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1571 + "test_set_reduce_scatter_divide_factor", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_compile.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_extensions.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_frozen.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_grad_scaler.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1508 + "test_gradient_scaler", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_ignore_params.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_init.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_logging.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_memory.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1535 + "test_fully_shard_training_memory", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_overlap.py": ( + # Performance test, should skip + "test_fully_shard_training_overlap", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_state_dict.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1572 + "test_dp_state_dict_cpu_offload", + ), + "../../../../test/distributed/_composable/fsdp/test_fully_shard_state.py": None, + "../../../../test/distributed/_composable/fsdp/test_fully_shard_training.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1508 + "test_post_optim_event", + # https://github.com/intel/torch-xpu-ops/issues/1504 + "test_train_parity_multi_group_unshard_async_op", + "test_train_parity_with_activation_checkpointing", + ), +} + +skip_dict_python = { + "distributed/test_c10d_ops_xccl.py": None, + "distributed/test_c10d_xccl.py": None, + # "../../../../test/distributed/pipelining/test_schedule_multiproc.py": None, # Hang error. + "../../../../test/distributed/pipelining/test_stage.py": None, + "../../../../test/distributed/pipelining/test_transformer.py": None, +}
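
The reworked print_md_row() in check-ut.py emits one Markdown table row at a time for the GitHub step summary, printing the header and the "---" delimiter row only on the first call and optionally mirroring rows into the failure-list file. A compact standalone sketch of that pattern (the sample rows below are invented for illustration):

def print_md_row(row, print_header=False):
    # Print the header and Markdown delimiter row once, then only data rows.
    if print_header:
        print("| " + " | ".join(row.keys()) + " |")
        print("| " + " | ".join(["---"] * len(row)) + " |")
    print("| " + " | ".join(str(v) for v in row.values()) + " |")

# Invented example rows, mimicking the summary fields used by check-ut.py.
rows = [
    {"Category": "op_ut", "UT": "op_ut_with_skip", "Test cases": 100, "Failures": 2},
    {"Category": "op_extended", "UT": "op_extended", "Test cases": 50, "Failures": 0},
]
for i, row in enumerate(rows):
    print_md_row(row, print_header=(i == 0))
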
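
The new get_message() scans a failure's traceback text for lines beginning with a known exception type instead of keeping only the first line of the junit message. A minimal self-contained sketch of that idea; the error type list is abbreviated and the sample traceback is invented for demonstration:

# Abbreviated version of the error_types list from check-ut.py.
ERROR_TYPES = ["RuntimeError", "ValueError", "AssertionError", "TimeoutError"]

def extract_error_messages(traceback_text):
    """Collect 'Type: message' lines that start with a known error type."""
    messages = []
    for line in traceback_text.splitlines():
        line = line.strip()
        for error_type in ERROR_TYPES:
            if line.startswith(error_type + ": "):
                messages.append(line)
                break
    return " ; ".join(messages)

# Invented traceback text, for demonstration only.
sample_traceback = """Traceback (most recent call last):
  File "test_example.py", line 10, in test_add
    self.assertEqual(out, expected)
AssertionError: Tensor-likes are not close!
"""

print(extract_error_messages(sample_traceback))  # AssertionError: Tensor-likes are not close!
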
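
parse_log_file() builds the per-suite summary from plain unittest output using regular expressions: the "Ran N tests in ...s" line, the skipped count, and FAIL: blocks. A rough standalone illustration of the same matching on a fabricated log snippet; the real script additionally de-duplicates FAIL blocks per test name and extracts the error text for each case:

import re

# Fabricated unittest-style output, for illustration only.
log_text = """
FAIL: test_allreduce (__mp_main__.TestXCCL)
RuntimeError: something went wrong

----------------------------------------------------------------------
Ran 42 tests in 12.345s

FAILED (failures=1, skipped=3)
"""

summary = {"Test cases": 0, "Skipped": 0, "Failures": 0}

ran = re.search(r"Ran (\d+) tests in [\d.]+s", log_text)
if ran:
    summary["Test cases"] = int(ran.group(1))

skipped = re.search(r"skipped[ =](\d+)", log_text, re.IGNORECASE)
if skipped:
    summary["Skipped"] = int(skipped.group(1))

# Count distinct FAIL: blocks (the real script also parses the case names).
summary["Failures"] = len(re.findall(r"^FAIL: ", log_text, re.MULTILINE))
summary["Passed"] = summary["Test cases"] - summary["Failures"] - summary["Skipped"]

print(summary)  # {'Test cases': 42, 'Skipped': 3, 'Failures': 1, 'Passed': 38}
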
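
run_distributed_local.py selects the largest XeLink-connected card group from "xpu-smi topology -m" and exports it as ZE_AFFINITY_MASK before launching the suites. A simplified sketch of that selection on a made-up topology matrix; the real xpu-smi output also carries header and CPU-affinity rows that the script skips:

# Made-up "xpu-smi topology -m" style matrix, for illustration only:
# each row lists a GPU and its link type to every other GPU.
topology_lines = [
    "GPU 0/0 S  XL SYS SYS",
    "GPU 1/0 XL S  SYS SYS",
    "GPU 2/0 SYS SYS S  XL",
    "GPU 3/0 SYS SYS XL S",
]

groups = {}
for line in topology_lines:
    items = [x for x in line.split(" ") if x]
    gpu_index = items[1].split("/")[0]
    # Columns 2.. are link types; "S" (self) and "XL" (XeLink) belong to the group,
    # while "SYS" means only a system-level connection.
    affinity = [str(j - 2) for j, item in enumerate(items)
                if j >= 2 and "SYS" not in item and ("XL" in item or "S" in item)]
    groups[gpu_index] = ",".join(affinity)

# Pick the largest connected group, as the script does for ZE_AFFINITY_MASK.
ze_affinity_mask = max(groups.values(), key=len)
print(ze_affinity_mask)  # "0,1" in this toy layout (two groups of two; the first wins the tie)
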
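
Because skip_list_dist_local.py hard-codes relative paths from test/xpu into the stock PyTorch checkout, renamed or removed upstream test files would silently drop coverage. A possible sanity check, not part of this change and purely hypothetical, assuming it is run from test/xpu with the PyTorch checkout in the expected relative location (only skip_dict is checked here, since skip_dict_python mixes path conventions):

import os
import sys

from skip_list_dist_local import skip_dict

# Paths are relative to test/xpu, pointing into ../../../../test/distributed/...
missing = [path for path in skip_dict if not os.path.exists(path)]

if missing:
    print("Skip-list entries that do not resolve to a file:")
    for path in missing:
        print("  " + path)
    sys.exit(1)

print(f"All {len(skip_dict)} skip_dict entries resolve to existing files.")
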