diff --git a/.github/actions/get-runner/action.yml b/.github/actions/get-runner/action.yml index f1c97bd9a9..36fc6c6bfc 100644 --- a/.github/actions/get-runner/action.yml +++ b/.github/actions/get-runner/action.yml @@ -16,8 +16,6 @@ outputs: pytest_extra_args: value: ${{ steps.runner.outputs.pytest_extra_args }} -permissions: read-all - runs: using: composite steps: @@ -59,10 +57,17 @@ runs: if(gpu==1 && $0~/Platform/){gpu=0}; if(gpu==1){print $0}; if($0~/Platform.*Graphics/){gpu=1} }' |wc -l)" cpus_per_xpu="$(echo |awk -v c="${cpu_num}" -v x="${xpu_num}" '{printf c/x}')" - pytest_extra_args="$(echo |awk -v x="${xpu_num}" -v cx="${cpus_per_xpu}" '{ + pytest_extra_args="$(echo |awk -v x="${xpu_num}" -v z="${ZE_AFFINITY_MASK}" -v cx="${cpus_per_xpu}" '{ if (x > 0) { + split(z, xpu_list, ","); for (i=0;i ${log_dir}/op_regression_test_error.log |tee ${log_dir}/op_regression_test.log + echo -e "File Path: cd pytorch/third_party/torch-xpu-ops/test/regressions" | tee -a ${{ github.workspace }}/ut_log/reproduce_op_regression.log + echo -e "Reproduce Command: pytest -sv failed_case" | tee -a ${{ github.workspace }}/ut_log/reproduce_op_regression.log - name: op_regression_dev1 shell: timeout 300 bash -xe {0} if: ${{ inputs.ut_name == 'op_regression_dev1' || inputs.ut_name == 'basic' }} @@ -30,6 +35,8 @@ runs: timeout 180 pytest test_operation_on_device_1.py \ --junit-xml=${{ github.workspace }}/ut_log/op_regression_dev1.xml \ 2> ${log_dir}/op_regression_dev1_test_error.log |tee ${log_dir}/op_regression_dev1_test.log + echo -e "File Path: cd pytorch/third_party/torch-xpu-ops/test/regressions" | tee -a ${{ github.workspace }}/ut_log/reproduce_op_regression_dev1.log + echo -e "Reproduce Command: pytest -sv failed_case" | tee -a ${{ github.workspace }}/ut_log/reproduce_op_regression_dev1.log - name: op_transformers shell: timeout 3600 bash -xe {0} if: ${{ inputs.ut_name == 'op_transformers' || inputs.ut_name == 'basic' }} @@ -41,6 +48,8 @@ runs: pytest test/test_transformers.py -k xpu \ --junit-xml=${{ github.workspace }}/ut_log/op_transformers.xml \ 2> ${log_dir}/op_transformers_test_error.log |tee ${log_dir}/op_transformers_test.log + echo -e "File Path: cd pytorch" | tee -a ${{ github.workspace }}/ut_log/reproduce_op_transformers.log + echo -e "Reproduce Command: pytest -sv test/failed_case -k xpu" | tee -a ${{ github.workspace }}/ut_log/reproduce_op_transformers.log - name: op_extended shell: timeout 3600 bash -xe {0} if: ${{ inputs.ut_name == 'op_extended' || inputs.ut_name == 'basic' }} @@ -53,6 +62,8 @@ runs: 2> ${log_dir}/op_extended_test_error.log |tee ${log_dir}/op_extended_test.log ls -al cp *.xml ${{ github.workspace }}/ut_log + echo -e "File Path: cd pytorch/third_party/torch-xpu-ops/test/xpu/extended" | tee -a ${{ github.workspace }}/ut_log/reproduce_op_extended.log + echo -e "Reproduce Command: pytest -sv failed_case" | tee -a ${{ github.workspace }}/ut_log/reproduce_op_extended.log - name: op_ut shell: timeout 18000 bash -xe {0} if: ${{ inputs.ut_name == 'op_ut' }} @@ -89,6 +100,8 @@ runs: tee ${{ github.workspace }}/ut_log/op_ut/op_ut_with_only_test.log ls -al cp *.xml ${{ github.workspace }}/ut_log + echo -e "File Path: cd pytorch/third_party/torch-xpu-ops/test/xpu" | tee -a ${{ github.workspace }}/ut_log/reproduce_op_ut.log + echo -e "Reproduce Command: pytest -sv failed_case" | tee -a ${{ github.workspace }}/ut_log/reproduce_op_ut.log - name: torch_xpu shell: timeout 3600 bash -xe {0} if: ${{ inputs.ut_name == 'torch_xpu' }} @@ -129,7 +142,6 @@ runs: python test/profiling/llama.py | 
\ tee ${{ github.workspace }}/ut_log/xpu_profiling/llama.log python .github/scripts/llama_summary.py -i ${{ github.workspace }}/ut_log/xpu_profiling/llama.log -o ${{ github.workspace }}/ut_log/xpu_profiling/llama_summary.csv - bash .github/scripts/check_baseline.sh .github/scripts/llama_baseline.csv ${{ github.workspace }}/ut_log/xpu_profiling/llama_summary.csv # All xpu ut under test/profiler cd ../../test/profiler @@ -147,10 +159,6 @@ runs: if: ${{ inputs.ut_name == 'xpu_distributed' }} run: | xpu-smi topology -m - sudo rm -rf ${{ github.workspace }}/ptrace_scope.bk - sudo cp /proc/sys/kernel/yama/ptrace_scope ${{ github.workspace }}/ptrace_scope.bk - cat ${{ github.workspace }}/ptrace_scope.bk - echo "0" |sudo tee /proc/sys/kernel/yama/ptrace_scope mkdir -p ut_log/xpu_distributed cd pytorch/third_party/torch-xpu-ops/test/xpu XCCL_ENABLE=$(python -c "import torch;print(torch.distributed.is_xccl_available())") @@ -161,3 +169,39 @@ runs: python run_distributed.py \ 2> ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | \ tee ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test.log + + # Summary + - name: UT Test Results Summary + shell: timeout 180 bash -xe {0} + run: | + pip install junitparser + python ./.github/scripts/check-ut.py ${{ github.workspace }}/ut_log/*.xml >> $GITHUB_STEP_SUMMARY || true + # Check the failure logs + if ls ${{ github.workspace }}/failures*.log 1> /dev/null 2>&1; then + echo -e "Exist Failure logs" + echo "Found Failure logs as below: " + for file in ${{ github.workspace }}/failures*.log; do + echo " - $file" + cp "$file" ${{ github.workspace }}/ut_log + done + echo -e "Failure logs Copied" + else + echo -e "No Failure logs" + fi + # Copied the passed logs + if ls passed*.log 1> /dev/null 2>&1; then + cp passed*.log ${{ github.workspace }}/ut_log + echo -e "Passed logs Copied" + else + echo -e "No Passed logs" + fi + # Copied the Summary logs + if ls category*.log 1> /dev/null 2>&1; then + cp category*.log ${{ github.workspace }}/ut_log + echo -e "Category logs Copied" + else + echo -e "No Category logs" + fi + if [ -e ut_failure_list.csv ];then + cp ut_failure_list.csv ${{ github.workspace }}/ut_log/ut_failure_list.csv || true + fi diff --git a/.github/actions/pt2e/action.yml b/.github/actions/pt2e/action.yml index bb0ec2a55e..8704ed0fc6 100644 --- a/.github/actions/pt2e/action.yml +++ b/.github/actions/pt2e/action.yml @@ -20,17 +20,20 @@ inputs: runs: using: composite steps: + - name: Check Python + shell: bash -xe {0} + run: | + which python && python -V + which pip && pip list - name: Prepare dataset shell: bash -xe {0} run: | # dataset - if [ ! -d ${HOME}/datasets/imagenet ];then - rm -rf ${HOME}/datasets/imagenet - mkdir -p ${HOME}/datasets/imagenet - cd ${HOME}/datasets/imagenet - wget https://image-net.org/data/ILSVRC/2012/ILSVRC2012_img_val.tar - tar -xf ILSVRC2012_img_val.tar + if [ ! 
-d ${dataset_dir} ];then + rm -rf ${dataset_dir} && mkdir -p ${dataset_dir} && cd ${dataset_dir} wget -O valprep.sh https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh + wget -q https://image-net.org/data/ILSVRC/2012/ILSVRC2012_img_val.tar + tar -xf ILSVRC2012_img_val.tar bash valprep.sh fi - name: PT2E Test (${{ inputs.dt }} ${{ inputs.scenario }}) @@ -41,7 +44,7 @@ runs: echo "Mode,Model,Dtype,Result" |tee ${pt2e_logs_dir}/summary.csv if [[ "${{ inputs.scenario }}" == *"accuracy"* ]];then models="alexnet,mnasnet1_0,mobilenet_v2,mobilenet_v3_large,resnet152,resnet18,resnet50,resnext50_32x4d,shufflenet_v2_x1_0,squeezenet1_1,vgg16" - cmd_line=" python pt2e-accuracy/scripts/modelbench/quant/inductor_quant_acc.py --device xpu --dataset_dir ${HOME}/datasets/imagenet " + cmd_line=" python pt2e-accuracy/scripts/modelbench/quant/inductor_quant_acc.py --device xpu --dataset_dir ${dataset_dir} " for model_name in $(echo $models |sed 's/,/ /g') do if [[ "${{ inputs.dt }}" == *"float32"* ]];then diff --git a/.github/ci_expected_accuracy/rolling/inductor_timm_models_training.csv b/.github/ci_expected_accuracy/rolling/inductor_timm_models_training.csv index 4a60aecac6..0359c4024e 100644 --- a/.github/ci_expected_accuracy/rolling/inductor_timm_models_training.csv +++ b/.github/ci_expected_accuracy/rolling/inductor_timm_models_training.csv @@ -52,7 +52,7 @@ sebotnet33ts_256,pass,pass,pass,pass,pass selecsls42b,pass,pass,pass,pass,pass spnasnet_100,pass,pass,pass,pass,pass # https://github.com/intel/torch-xpu-ops/issues/1768 -swin_base_patch4_window7_224,pass,pass,pass,pass,pass +swin_base_patch4_window7_224,pass,fail_accuracy,fail_accuracy,pass,pass swsl_resnext101_32x16d,pass,pass,pass,pass,pass tf_efficientnet_b0,pass,pass,pass,pass,pass tf_mixnet_l,pass,pass,pass,pass,pass diff --git a/.github/scripts/check-ut.py b/.github/scripts/check-ut.py index c9afb73eb8..c21f9ceda1 100644 --- a/.github/scripts/check-ut.py +++ b/.github/scripts/check-ut.py @@ -3,6 +3,7 @@ import os import re from junitparser import JUnitXml, Error, Failure, Skipped +from collections import defaultdict parser = argparse.ArgumentParser(description='Test results analyzer') parser.add_argument('input_files', nargs='+', help='JUnit XML files or log files') @@ -10,6 +11,16 @@ failures = [] summaries = [] +failures_by_category = defaultdict(list) +passed_cases = [] +passed_by_category = defaultdict(list) +category_totals = defaultdict(lambda: { + 'Test cases': 0, + 'Passed': 0, + 'Skipped': 0, + 'Failures': 0, + 'Errors': 0 +}) error_types = [ "RuntimeError", @@ -38,6 +49,14 @@ def get_name(case): return case.get('name', '') return ' '.join(case.name.split()) +def get_category_from_case(case): + if isinstance(case, dict): + return case.get('category', 'unknown') + else: + if hasattr(case, '_file_category'): + return case._file_category + return 'unknown' + def get_result(case): if isinstance(case, dict): return case.get('status', 'failed') @@ -108,6 +127,7 @@ def print_failures(failure_list=None): print_header = True for case in failures: print_md_row({ + 'Category': get_category_from_case(case), 'Class name': get_classname(case), 'Test name': get_name(case), 'Status': get_result(case), @@ -116,13 +136,33 @@ def print_failures(failure_list=None): }, print_header, failure_list=failure_list) print_header = False +def generate_failures_log(): + if not failures: + return + + for case in failures: + category = get_category_from_case(case) + failures_by_category[category].append(case) + + for category, 
category_failures in failures_by_category.items(): + if not category_failures: + continue + + log_filename = f"failures_{category}.log" + with open(log_filename, "w", encoding='utf-8') as log_file: + for case in category_failures: + class_name = get_classname(case) + test_name = get_name(case) + log_file.write(f"{category},{class_name},{test_name}\n") + def parse_log_file(log_file): with open(log_file, encoding='utf-8') as f: content = f.read() ut_name = os.path.splitext(os.path.basename(log_file))[0] + category = determine_category(ut_name) summary = { - 'Category': determine_category(ut_name), + 'Category': category, 'UT': ut_name, 'Test cases': 0, 'Passed': 0, @@ -170,19 +210,29 @@ def parse_log_file(log_file): for match in error_matches: error_msg.append(match.group(0).strip()) - failures.append({ + failure_case = { 'classname': ut_name, 'name': f"{case_match.group(2)}:{test_name}", 'error': " ".join(error_msg), 'status': 'failed', - 'source': 'Log' - }) + 'source': 'Log', + 'category': category + } + failures.append(failure_case) + failures_by_category[category].append(failure_case) failures_number += 1 if failures_number > summary['Failures']: summary['Failures'] = failures_number summary['Passed'] = summary['Test cases'] - summary['Failures'] - summary['Skipped'] + # Update category totals + category_totals[category]['Test cases'] += summary['Test cases'] + category_totals[category]['Passed'] += summary['Passed'] + category_totals[category]['Skipped'] += summary['Skipped'] + category_totals[category]['Failures'] += summary['Failures'] + category_totals[category]['Errors'] += summary['Errors'] + return summary def determine_category(ut): @@ -192,6 +242,10 @@ def determine_category(ut): return 'op_regression_dev1' elif ut == 'op_extended': return 'op_extended' + elif ut == 'op_transformers': + return 'op_transformers' + elif ut == 'test_xpu': + return 'test_xpu' elif 'op_ut' in ut: return 'op_ut' else: @@ -223,12 +277,55 @@ def process_xml_file(xml_file): } summaries.append(suite_summary) + # Update category totals + category_totals[category]['Test cases'] += suite_summary['Test cases'] + category_totals[category]['Passed'] += suite_summary['Passed'] + category_totals[category]['Skipped'] += suite_summary['Skipped'] + category_totals[category]['Failures'] += suite_summary['Failures'] + category_totals[category]['Errors'] += suite_summary['Errors'] + for case in suite: if get_result(case) not in ["passed", "skipped"]: + case._file_category = category failures.append(case) + elif get_result(case) == "passed": + case._file_category = category + passed_cases.append(case) + passed_by_category[category].append(case) except Exception as e: print(f"Error processing {xml_file}: {e}", file=sys.stderr) +def generate_passed_log(): + if not passed_cases: + return + + for category, category_passed in passed_by_category.items(): + if not category_passed: + continue + + log_filename = f"passed_{category}.log" + with open(log_filename, "w", encoding='utf-8') as log_file: + for case in category_passed: + class_name = get_classname(case) + test_name = get_name(case) + status = get_result(case) + log_file.write(f"{category},{class_name},{test_name}\n") + +def generate_category_totals_log(): + """Generate log files with category totals""" + for category, totals in category_totals.items(): + if totals['Test cases'] == 0: + continue + + log_filename = f"category_{category}.log" + with open(log_filename, "w", encoding='utf-8') as log_file: + log_file.write(f"Category: {category}\n") + log_file.write(f"Test 
cases: {totals['Test cases']}\n") + log_file.write(f"Passed: {totals['Passed']}\n") + log_file.write(f"Skipped: {totals['Skipped']}\n") + log_file.write(f"Failures: {totals['Failures']}\n") + log_file.write(f"Errors: {totals['Errors']}\n") + def print_summary(): print("### Results Summary") print_header = True @@ -276,6 +373,10 @@ def main(): with open("ut_failure_list.csv", "w") as failure_list: print_failures(failure_list=failure_list) + + generate_failures_log() + generate_passed_log() + generate_category_totals_log() print_summary() diff --git a/.github/scripts/ut_result_check.sh b/.github/scripts/ut_result_check.sh index cc3131ccef..0e6b95ec45 100644 --- a/.github/scripts/ut_result_check.sh +++ b/.github/scripts/ut_result_check.sh @@ -2,17 +2,13 @@ ut_suite="${1:-op_regression}" # op_regression / op_extended / op_ut / torch_xpu # usage -# compare_and_filter_logs [output.log] +# check_new_failed [output.log] +all_pass="" -compare_and_filter_logs() { +check_new_failed() { local file_UT="$1" local file_known_issue="$2" local output_file="${3:-${file_UT%.*}_filtered.log}" - local filtered_content="${file_UT%.*}_removed.log" - local temp_file="temp_parts.log" - local temp_file_labeled="temp_parts_labeled.log" - local temp_output="${3:-${file_UT%.*}_filtered_temp.log}" - local temp_final="${file_UT%.*}_final_temp.log" if [[ $# -lt 2 ]]; then echo "[ERROR] Need 2 files to compare" @@ -29,36 +25,12 @@ compare_and_filter_logs() { # Filter the same content from file_UT as file_known_issue echo "Filtering $file_known_issue for $file_UT" + if grep -q $'\r' "$file_UT"; then + echo "Detected log from windows" + sed -i 's/\r$//' "$file_UT" + fi grep -vFxf "$file_known_issue" "$file_UT" > "$output_file" - # Keep the filtered UT cases - grep -noFf "$file_known_issue" "$file_UT" > "$filtered_content" - echo "Filtered cases file: $filtered_content" - true > "$temp_file" - true > "$temp_file_labeled" - true > "$temp_output" - true > "$temp_final" - grep -E '\.py$|,' "$output_file" > "$temp_output" - while IFS= read -r line; do - IFS=',' read -ra parts <<< "$line" - for part in "${parts[@]}"; do - part_trimmed=$(echo "$part" | xargs) - if [[ -n "$part_trimmed" ]] && ! 
grep -qF "$part_trimmed" "$file_known_issue"; then - echo "$part_trimmed" >> "$temp_file" - echo -e "\n\033[1;33m[Check the failed cases in summary line]\033[0m" - echo -e "\033[1;33mCase not found in ${file_known_issue}: '${part_trimmed}' (from line: '${line}')\033[0m" - else - echo -e "\n\033[1;33m[Check the failed cases in summary line]\033[0m" - echo -e "\n\033[1;32m${part_trimmed} found in ${file_known_issue} (from line: '${line}')\033[0m" - fi - done - done < "$temp_output" - - awk '{print $0 " [in summary line]"}' "$temp_file" > "$temp_file_labeled" - grep -vE '\.py$|,' "$output_file" > "$temp_final" - cat "$temp_file_labeled" >> "$temp_final" - mv "$temp_final" "$output_file" - echo -e "\n\033[1;31m[New failed cases Summary]\033[0m" if [[ -z "$(tr -d ' \t\n\r\f' < "$output_file" 2>/dev/null)" ]]; then echo -e "\033[1;32mNo new failed cases found\033[0m" @@ -66,20 +38,15 @@ compare_and_filter_logs() { echo -e "\n\033[1;31mNew failed cases, not in known issues\033[0m" cat "$output_file" fi +} - if [[ -s "$filtered_content" ]]; then - echo -e "\n\033[1;31m[These failed cases are in skip list, will filter]\033[0m" - awk -F':' '{ - line_number = $1 - $1 = "" - gsub(/^ /, "", $0) - printf "\033[33m%3d\033[0m: %s\n", line_number, $0 - }' "$filtered_content" - else - echo -e "\n\033[1;32mNo Skipped Cases\033[0m" - fi - - rm -f ${temp_output} ${temp_file} ${temp_final} +check_filtered_logs() { + local file_UT="$1" + local file_known_issue="$2" + local filtered_content="${file_UT%.*}_removed.log" + # Keep the filtered UT cases + grep -noFf "$file_known_issue" "$file_UT" > "$filtered_content" + echo "Filtered cases file: $filtered_content" } check_passed_known_issues() { @@ -91,6 +58,10 @@ check_passed_known_issues() { return 1 fi echo "Checking for known issues that are now passing in $file_passed_UT" + if grep -q $'\r' "$file_passed_UT"; then + echo "Detected log from windows" + sed -i 's/\r$//' "$file_passed_UT" + fi grep -Fxf "$file_passed_UT" "$file_known_issue" > "$output_file" echo -e "\n\033[1;32m[New passed cases Summary]\033[0m" if [[ -s "$output_file" ]]; then @@ -99,131 +70,133 @@ check_passed_known_issues() { else echo -e "\033[1;33mNo known issues are now passing\033[0m" fi + + rm -f ${output_file} } -get_pass_fail_log() { - local p_row="$1" - local p_col="$2" - local ut_log="$3" - grep -E "${p_row}" "${ut_log}" | awk -v p="${p_col}" '{ - for (i=1;i<=NF;i++) { - if ($i ~ p) { - print $i; - } - } - }' +check_test_cases() { + local log_file="$1" + declare -A expected_cases=( + ["op_extended"]=5349 + ["op_regression"]=244 + ["op_regression_dev1"]=1 + ["op_transformers"]=237 + ["op_ut"]=120408 + ["test_xpu"]=69 + ) + + if [[ ! 
-f "$log_file" ]]; then + echo "False" + echo "[ERROR] Need test file $log_file" >&2 + return 1 + fi + + all_pass="true" + local current_category="" + + while IFS= read -r line; do + if [[ $line =~ ^Category:\ ([^[:space:]]+) ]]; then + current_category="${BASH_REMATCH[1]}" + elif [[ $line =~ Test\ cases:\ ([0-9]+) ]] && [[ -n "$current_category" ]]; then + actual_cases="${BASH_REMATCH[1]}" + expected_cases_value="${expected_cases[$current_category]}" + + if [[ -n "$expected_cases_value" ]]; then + threshold=$(echo "$expected_cases_value * 0.95" | bc -l | awk '{print int($1+0.5)}') + + echo "Category: $current_category" + echo "Expected number: $expected_cases_value" + echo "Current number: $actual_cases" + echo "Threshold(95%): $threshold" + + if [[ "$actual_cases" -lt "$threshold" ]]; then + echo " Status: ❌ Abnormal (reduction exceeds 5%)" + all_pass="false" + else + reduction=$(echo "scale=2; ($actual_cases/$expected_cases_value - 1) * 100" | bc -l) + echo " Status: ✅ Normal (reduction ${reduction}%)" + fi + echo "----------------------------------------" + fi + current_category="" + fi + done < "$log_file" } -if [[ "${ut_suite}" == 'op_regression' || "${ut_suite}" == 'op_regression_dev1' || "${ut_suite}" == 'op_extended' || "${ut_suite}" == 'op_transformers' ]]; then - get_pass_fail_log ".FAILED" "::.*::" "${ut_suite}"_test.log > ./"${ut_suite}"_failed.log - grep -E "Timeout" "${ut_suite}"_test.log | grep "test" >> ./"${ut_suite}"_failed.log - get_pass_fail_log "PASSED" "::.*::" "${ut_suite}"_test.log > ./"${ut_suite}"_passed.log + +if [[ "${ut_suite}" == 'op_regression' || "${ut_suite}" == 'op_regression_dev1' || "${ut_suite}" == 'op_extended' || "${ut_suite}" == 'op_transformers' || "${ut_suite}" == 'op_ut' || "${ut_suite}" == 'test_xpu' ]]; then echo -e "=========================================================================" echo -e "Show Failed cases in ${ut_suite}" echo -e "=========================================================================" - cat "./${ut_suite}_failed.log" - echo -e "=========================================================================" - echo -e "Checking Failed cases in ${ut_suite}" - echo -e "=========================================================================" - compare_and_filter_logs "${ut_suite}"_failed.log Known_issue.log - echo -e "=========================================================================" - echo -e "Checking New passed cases in Known issue list for ${ut_suite}" - echo -e "=========================================================================" - check_passed_known_issues "${ut_suite}"_passed.log Known_issue.log - if [[ -f "${ut_suite}_failed_filtered.log" ]]; then - num_failed=$(wc -l < "./${ut_suite}_failed_filtered.log") - else - num_failed=$(wc -l < "./${ut_suite}_failed.log") - fi - num_passed=$(wc -l < "./${ut_suite}_passed.log") - if [[ $num_failed -gt 0 ]] || [[ $num_passed -le 0 ]]; then - echo -e "[ERROR] UT ${ut_suite} test Fail" - exit 1 + if [[ -f "failures_${ut_suite}.log" ]]; then + cat "./failures_${ut_suite}.log" else - echo -e "[PASS] UT ${ut_suite} test Pass" + echo -e "\033[1;32mNo failed cases\033[0m" fi -fi -if [[ "${ut_suite}" == 'op_ut' ]]; then - get_pass_fail_log ".FAILED" "::.*::" op_ut_with_skip_test.log > ./"${ut_suite}"_with_skip_test_failed.log - grep -E "Timeout" op_ut_with_skip_test.log | grep "test" >> ./"${ut_suite}"_with_skip_test_failed.log - get_pass_fail_log "PASSED" "::.*::" op_ut_with_skip_test.log > ./"${ut_suite}"_with_skip_test_passed.log - get_pass_fail_log ".FAILED" 
"::.*::" op_ut_with_only_test.log > ./"${ut_suite}"_with_only_test_failed.log - grep -E "Timeout" op_ut_with_only_test.log | grep "test" >> ./"${ut_suite}"_with_only_test_failed.log - get_pass_fail_log "PASSED" "::.*::" op_ut_with_only_test.log > ./"${ut_suite}"_with_only_test_passed.log - echo -e "=========================================================================" - echo -e "Show Failed cases in ${ut_suite} with skip" - echo -e "=========================================================================" - cat "./${ut_suite}_with_skip_test_failed.log" echo -e "=========================================================================" - echo -e "Checking Failed cases in ${ut_suite} with skip" - echo -e "=========================================================================" - compare_and_filter_logs "${ut_suite}"_with_skip_test_failed.log Known_issue.log - echo -e "=========================================================================" - echo -e "Checking New passed cases in Known issue list for ${ut_suite} with skip" - echo -e "=========================================================================" - check_passed_known_issues "${ut_suite}"_with_skip_test_passed.log Known_issue.log - if [[ -f "${ut_suite}_with_skip_test_failed_filtered.log" ]]; then - num_failed_with_skip=$(wc -l < "./${ut_suite}_with_skip_test_failed_filtered.log") + echo -e "Checking Test case number for ${ut_suite}" + echo -e "=========================================================================" + check_test_cases category_${ut_suite}.log + echo -e "=========================================================================" + echo -e "Checking Filtered cases for ${ut_suite}" + echo -e "=========================================================================" + if [[ -f "failures_${ut_suite}.log" ]]; then + check_filtered_logs failures_${ut_suite}.log Known_issue.log + num_filtered=$(wc -l < "./failures_${ut_suite}_removed.log") + if [[ $num_filtered -gt 0 ]]; then + echo -e "\n\033[1;31m[These failed cases are in skip list, will filter]\033[0m" + awk -F':' '{ + line_number = $1 + $1 = "" + gsub(/^ /, "", $0) + printf "\033[33m%3d\033[0m: %s\n", line_number, $0 + }' "failures_${ut_suite}_removed.log" + else + echo -e "\n\033[1;32mNo Skipped Cases\033[0m" + fi else - num_failed_with_skip=$(wc -l < "./${ut_suite}_with_skip_test_failed.log") + echo -e "\033[1;32mNo need to check filtered cases\033[0m" fi echo -e "=========================================================================" - echo -e "Show Failed cases in ${ut_suite} with only" - echo -e "=========================================================================" - cat "./${ut_suite}_with_only_test_failed.log" - echo -e "=========================================================================" - echo -e "Checking Failed cases in ${ut_suite} with only" + echo -e "Checking New passed cases in Known issue list for ${ut_suite}" echo -e "=========================================================================" - compare_and_filter_logs "${ut_suite}"_with_only_test_failed.log Known_issue.log + check_passed_known_issues passed_${ut_suite}.log Known_issue.log echo -e "=========================================================================" - echo -e "Checking New passed cases in Known issue list for ${ut_suite} with only" + echo -e "Checking New Failed cases in ${ut_suite}" echo -e "=========================================================================" - check_passed_known_issues "${ut_suite}"_with_only_test_passed.log Known_issue.log - if [[ 
-f "${ut_suite}_with_only_test_failed_filtered.log" ]]; then - num_failed_with_only=$(wc -l < "./${ut_suite}_with_only_test_failed_filtered.log") + if [[ -f "failures_${ut_suite}.log" ]]; then + check_new_failed failures_${ut_suite}.log Known_issue.log else - num_failed_with_only=$(wc -l < "./${ut_suite}_with_only_test_failed.log") + echo -e "\033[1;32mNo need to check failed cases\033[0m" fi - ((num_failed=num_failed_with_skip+num_failed_with_only)) - num_passed_with_skip=$(wc -l < "./${ut_suite}_with_skip_test_passed.log") - num_passed_with_only=$(wc -l < "./${ut_suite}_with_only_test_passed.log") - ((num_passed=num_passed_with_skip+num_passed_with_only)) - if [[ $num_failed -gt 0 ]] || [[ $num_passed -le 0 ]]; then - echo -e "[ERROR] UT ${ut_suite} test Fail" - exit 1 + + if [[ -f "failures_${ut_suite}_filtered.log" ]]; then + num_failed=$(wc -l < "./failures_${ut_suite}_filtered.log") + elif [[ -f "failures_${ut_suite}.log" ]]; then + num_failed=$(wc -l < "./failures_${ut_suite}.log") else - echo -e "[PASS] UT ${ut_suite} test Pass" + num_failed=0 fi -fi -if [[ "${ut_suite}" == 'torch_xpu' ]]; then - echo "Pytorch XPU binary UT checking" - cd ../../pytorch || exit - for xpu_case in build/bin/*{xpu,sycl}*; do - if [[ "$xpu_case" != *"*"* && "$xpu_case" != *.so && "$xpu_case" != *.a ]]; then - case_name=$(basename "$xpu_case") - cd ../ut_log/torch_xpu || exit - grep -E "FAILED" binary_ut_"${ut_suite}"_"${case_name}"_test.log | awk '{print $2}' > ./binary_ut_"${ut_suite}"_"${case_name}"_failed.log - wc -l < "./binary_ut_${ut_suite}_${case_name}_failed.log" | tee -a ./binary_ut_"${ut_suite}"_failed_summary.log - grep -E "PASSED|Pass" binary_ut_"${ut_suite}"_"${case_name}"_test.log | awk '{print $2}' > ./binary_ut_"${ut_suite}"_"${case_name}"_passed.log - wc -l < "./binary_ut_${ut_suite}_${case_name}_passed.log" | tee -a ./binary_ut_"${ut_suite}"_passed_summary.log - cd - || exit - fi - done + num_passed=$(wc -l < "./passed_${ut_suite}.log") echo -e "=========================================================================" - echo -e "Show Failed cases in ${ut_suite}" + echo -e "Provide the reproduce command for ${ut_suite}" echo -e "=========================================================================" - cd ../ut_log/torch_xpu || exit - cat "./binary_ut_${ut_suite}_${case_name}_failed.log" - num_failed_binary_ut=$(awk '{sum += $1};END {print sum}' binary_ut_"${ut_suite}"_failed_summary.log) - num_passed_binary_ut=$(awk '{sum += $1};END {print sum}' binary_ut_"${ut_suite}"_passed_summary.log) - ((num_failed=num_failed_binary_ut)) - if [[ $num_failed -gt 0 ]] || [[ $num_passed_binary_ut -le 0 ]]; then + if [[ $num_failed -gt 0 ]]; then + echo -e "Need reproduce command" + if [[ -f "reproduce_${ut_suite}.log" ]]; then + cat "./reproduce_${ut_suite}.log" + fi + else + echo -e "Not need reproduce command" + fi + if [[ $num_failed -gt 0 ]] || [[ $num_passed -le 0 ]] || [[ "$all_pass" == 'false' ]]; then echo -e "[ERROR] UT ${ut_suite} test Fail" exit 1 else echo -e "[PASS] UT ${ut_suite} test Pass" fi fi + if [[ "${ut_suite}" == 'xpu_distributed' ]]; then grep -E "^FAILED" xpu_distributed_test.log | awk '{print $2}' > ./"${ut_suite}"_xpu_distributed_test_failed.log grep "PASSED" xpu_distributed_test.log | awk '{print $1}' > ./"${ut_suite}"_xpu_distributed_test_passed.log @@ -232,13 +205,29 @@ if [[ "${ut_suite}" == 'xpu_distributed' ]]; then echo -e "=========================================================================" cat "./${ut_suite}_xpu_distributed_test_failed.log" echo -e 
"=========================================================================" - echo -e "Checking Failed cases in ${ut_suite} xpu distributed" + echo -e "Checking Filtered cases for ${ut_suite} xpu distributed" echo -e "=========================================================================" - compare_and_filter_logs "${ut_suite}"_xpu_distributed_test_failed.log Known_issue.log + check_filtered_logs "${ut_suite}"_xpu_distributed_test_failed.log Known_issue.log + num_filtered_xpu_distributed=$(wc -l < "./${ut_suite}_xpu_distributed_test_failed_removed.log") + if [[ $num_filtered_xpu_distributed -gt 0 ]]; then + echo -e "\n\033[1;31m[These failed cases are in skip list, will filter]\033[0m" + awk -F':' '{ + line_number = $1 + $1 = "" + gsub(/^ /, "", $0) + printf "\033[33m%3d\033[0m: %s\n", line_number, $0 + }' "${ut_suite}_xpu_distributed_test_failed_removed.log" + else + echo -e "\n\033[1;32mNo Skipped Cases\033[0m" + fi echo -e "=========================================================================" echo -e "Checking New passed cases in Known issue list for ${ut_suite}" echo -e "=========================================================================" check_passed_known_issues "${ut_suite}"_xpu_distributed_test_passed.log Known_issue.log + echo -e "=========================================================================" + echo -e "Checking Failed cases in ${ut_suite} xpu distributed" + echo -e "=========================================================================" + check_new_failed "${ut_suite}"_xpu_distributed_test_failed.log Known_issue.log if [[ -f "${ut_suite}_xpu_distributed_test_failed_filtered.log" ]]; then num_failed_xpu_distributed=$(wc -l < "./${ut_suite}_xpu_distributed_test_failed_filtered.log") else diff --git a/.github/workflows/_linux_accelerate.yml b/.github/workflows/_linux_accelerate.yml index a40601d625..decf4e612f 100644 --- a/.github/workflows/_linux_accelerate.yml +++ b/.github/workflows/_linux_accelerate.yml @@ -26,7 +26,7 @@ on: runner: required: true type: string - default: 'linux.idc.xpu' + default: 'pvc_rolling' description: Runner label accelerate: required: false @@ -45,11 +45,15 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true +defaults: + run: + shell: bash {0} + jobs: conditions-filter: name: conditions-filter if: ${{ github.event.pull_request.draft == false }} - runs-on: ubuntu-22.04 + runs-on: ubuntu-24.04 timeout-minutes: 10 env: GH_TOKEN: ${{ github.token }} @@ -66,22 +70,46 @@ jobs: disabled_tests="$(awk '/disable_/{printf("%s ", $0)}' pr-info.txt)" echo "disabled_tests=${disabled_tests}" |tee "${GITHUB_OUTPUT}" - Torch-XPU-Accelerate-Tests: - runs-on: ${{ inputs.runner != '' && inputs.runner || 'linux.idc.xpu' }} + prepare: + runs-on: ${{ inputs.runner != '' && inputs.runner || 'pvc_rolling' }} needs: conditions-filter if: ${{ !(contains(needs.conditions-filter.outputs.disabled_tests, 'disable_all') || contains(needs.conditions-filter.outputs.disabled_tests, 'disable_accelerate')) }} + outputs: + runner_id: ${{ steps.runner-info.outputs.runner_id }} + user_id: ${{ steps.runner-info.outputs.user_id }} + render_id: ${{ steps.runner-info.outputs.render_id }} + hostname: ${{ steps.runner-info.outputs.hostname }} + pytest_extra_args: ${{ steps.runner-info.outputs.pytest_extra_args }} + steps: + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Get runner + id: runner-info + uses: ./.github/actions/get-runner + + tests: + runs-on: ${{ 
needs.prepare.outputs.runner_id }} + needs: prepare + container: + image: mengfeili/intel-pvc-driver:1146-1136 + volumes: + - ${{ github.workspace }}:${{ github.workspace }} + options: --device=/dev/mem --device=/dev/dri --group-add video --group-add ${{ needs.prepare.outputs.render_id }} + --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --shm-size=8g + -u ${{ needs.prepare.outputs.user_id }} + -e ZE_AFFINITY_MASK + env: + WORK_DIR: 'accelerate' + PYTORCH_DEBUG_XPU_FALLBACK: 1 + HF_HUB_ETAG_TIMEOUT: 120 + HF_HUB_DOWNLOAD_TIMEOUT: 120 + PARSE_JUNIT: ${{ github.workspace }}/torch-xpu-ops/.github/scripts/parse-junitxml.py + AGENT_TOOLSDIRECTORY: /tmp/xpu-tool + PYTEST_ADDOPTS: -rsf --timeout 600 --timeout_method=thread --dist worksteal ${{ needs.prepare.outputs.pytest_extra_args }} env: - WORK_DIR: 'accelerate' - NEOReadDebugKeys: 0 - DisableScratchPages: 0 accelerate: ${{ inputs.accelerate != '' && inputs.accelerate || 'v1.6.0' }} transformers: ${{ inputs.transformers != '' && inputs.transformers || 'v4.51.3' }} python: ${{ inputs.python != '' && inputs.python || '3.10' }} - PYTORCH_DEBUG_XPU_FALLBACK: 1 - ZE_AFFINITY_MASK: 0 - PARSE_JUNIT: ${{ github.workspace }}/torch-xpu-ops/.github/scripts/parse-junitxml.py - HF_HUB_ETAG_TIMEOUT: 120 - HF_HUB_DOWNLOAD_TIMEOUT: 120 steps: - name: Checkout torch-xpu-ops uses: actions/checkout@v4 @@ -93,25 +121,22 @@ jobs: repository: huggingface/accelerate ref: ${{ env.accelerate }} path: accelerate - - name: Create unique Conda ENV name + - name: Setup python-${{ env.python }} + uses: actions/setup-python@v5 + with: + python-version: ${{ env.python }} + - name: Check python run: | - random=$(head /dev/urandom | tr -dc A-Za-z0-9_ | head -c ${1:-5} | xargs) - echo "CONDA_ENV_NAME=hf_accelerate_test_${ZE_AFFINITY_MASK}_${random}" >> $GITHUB_ENV - - name: Prepare Conda ENV + which python && python -V + which pip && pip list + pip install -U pip wheel setuptools + - name: Install pytorch and deps run: | - echo "Using Conda ENV name: $CONDA_ENV_NAME" - conda remove --all -y -n $CONDA_ENV_NAME || rm -rf $(dirname ${CONDA_EXE})/../envs/$CONDA_ENV_NAME - conda create -y -n $CONDA_ENV_NAME python=${{ env.python }} - source activate $CONDA_ENV_NAME - pip install junitparser pytest-timeout + pip install junitparser pip install transformers==${{ env.transformers }} - - name: Prepare Stock XPU Pytorch - run: | - source activate $CONDA_ENV_NAME pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu - name: Prepare Accelerate run: | - source activate $CONDA_ENV_NAME cd $WORK_DIR pip install -e . pip install -e ".[testing]" @@ -120,7 +145,6 @@ jobs: cp ${{ github.workspace }}/torch-xpu-ops/.github/scripts/spec.py ./ - name: Report installed versions run: | - source activate $CONDA_ENV_NAME echo "pip installed packages:" pip list | tee ${{ github.workspace }}/$WORK_DIR/tests_log/pip_list.txt echo "lspci gpu devices:" @@ -131,16 +155,17 @@ jobs: xpu-smi discovery -y --json --dump -1 - name: Sanity check installed packages run: | - source activate $CONDA_ENV_NAME + # Use latest pytest + pip install -U pytest pytest-timeout pytest-xdist # These checks are to exit earlier if for any reason torch # packages were reinstalled back to CUDA versions (not expected). 
pip show torch | grep Version | grep xpu pip show torchaudio | grep Version | grep xpu pip show torchvision | grep Version | grep xpu python -c 'import torch; exit(not torch.xpu.is_available())' - - name: Run tests + printenv + - name: Run tests on ${{ needs.prepare.outputs.hostname }} run: | - source activate $CONDA_ENV_NAME cd $WORK_DIR && rm -rf reports && mkdir -p reports # Excluding tests due to: # * tests/test_examples.py::FeatureExamplesTests::test_profiler fails on @@ -150,8 +175,7 @@ jobs: # * tests/test_big_modeling.py::test_dispatch_model_tied_weights_memory_with_nested_offload_cpu fails # with OOM. That's a new test added by https://github.com/huggingface/accelerate/pull/3445 pattern="not test_profiler and not test_gated and not test_dispatch_model_tied_weights_memory_with_nested_offload_cpu" - cmd=(python3 -m pytest --timeout 600 -rsf --junitxml=reports/accelerate.xml -k "$pattern" \ - tests/) + cmd=(python -m pytest --junitxml=reports/accelerate.xml -k "$pattern" tests/) { echo "### Running" echo "\`\`\`" @@ -162,28 +186,20 @@ jobs: - name: Print result tables if: ${{ ! cancelled() }} run: | - source activate $CONDA_ENV_NAME cd $WORK_DIR { echo "### Results" - python3 $PARSE_JUNIT reports/accelerate.xml --stats + python $PARSE_JUNIT reports/accelerate.xml --stats echo "### Failed" - python3 $PARSE_JUNIT reports/accelerate.xml --errors --failed + python $PARSE_JUNIT reports/accelerate.xml --errors --failed echo "### Skipped" - python3 $PARSE_JUNIT reports/accelerate.xml --skipped + python $PARSE_JUNIT reports/accelerate.xml --skipped } >> $GITHUB_STEP_SUMMARY - name: Print environment if: ${{ ! cancelled() }} uses: ./torch-xpu-ops/.github/actions/print-environment with: - conda: $CONDA_ENV_NAME pip_packages: 'accelerate transformers' - - name: Clean up - if: ${{ always() }} - run: | - if [ -n "$CONDA_ENV_NAME" ]; then - conda remove --all -y -n $CONDA_ENV_NAME || rm -rf $(dirname ${CONDA_EXE})/../envs/$CONDA_ENV_NAME - fi - name: Upload Test log if: ${{ ! 
cancelled() }} uses: actions/upload-artifact@v4 diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index d7257a4e77..ab49d28490 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -46,6 +46,10 @@ permissions: read-all defaults: run: shell: bash -xe {0} +env: + GH_TOKEN: ${{ github.token }} + HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} jobs: runner: @@ -79,14 +83,11 @@ jobs: options: --device=/dev/mem --device=/dev/dri --group-add video --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --shm-size=8g -u ${{ needs.runner.outputs.user_id }}:${{ needs.runner.outputs.render_id }} env: - AGENT_TOOLSDIRECTORY: /tmp/xpu-tool - MODEL_ONLY_NAME: ${{ inputs.model }} xpu_num: ${{ needs.runner.outputs.xpu_num }} cpus_per_xpu: ${{ needs.runner.outputs.cpus_per_xpu }} - env: - GH_TOKEN: ${{ github.token }} - HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + MODEL_ONLY_NAME: ${{ inputs.model }} + AGENT_TOOLSDIRECTORY: /tmp/xpu-tool + dataset_dir: ${{ runner.temp }}/../_datasets/imagenet steps: - name: Checkout torch-xpu-ops uses: actions/checkout@v4 @@ -189,6 +190,7 @@ jobs: # On-demand launch - name: Get test data types + if: ${{ contains(inputs.test_type, 'ondemand') }} id: dtypes run: | e2e_dtypes="$(echo "${{ inputs.dt }}" |sed 's/,int8//;s/int8,//')" diff --git a/.github/workflows/_linux_op_benchmark.yml b/.github/workflows/_linux_op_benchmark.yml index 5231a29419..b65d985008 100644 --- a/.github/workflows/_linux_op_benchmark.yml +++ b/.github/workflows/_linux_op_benchmark.yml @@ -22,6 +22,11 @@ permissions: read-all defaults: run: shell: bash -xe {0} +env: + GH_TOKEN: ${{ github.token }} + HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + reference_issue: 1689 jobs: runner: @@ -56,19 +61,15 @@ jobs: -u ${{ needs.runner.outputs.user_id }}:${{ needs.runner.outputs.render_id }} env: AGENT_TOOLSDIRECTORY: /opt/xpu-tool - REFERENCE_ISSUE: 1689 - env: - GH_TOKEN: ${{ github.token }} steps: - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - - name: Prepare test env on ${{ needs.runner.outputs.hostname }} + - name: Prepare test env uses: ./.github/actions/linux-testenv with: pytorch: ${{ inputs.pytorch }} python: ${{ inputs.python }} - - - name: Run Torch XPU Op Benchmark + - name: Run Torch XPU Op Benchmark on ${{ needs.runner.outputs.hostname }} run: | mkdir -p ${{ github.workspace }}/op_benchmark cd test/microbench @@ -92,9 +93,6 @@ jobs: op_benchmark_test_results_check: needs: op_benchmark runs-on: ubuntu-24.04 - env: - GH_TOKEN: ${{ github.token }} - reference_issue: 1689 steps: - name: Install gh-cli run: | @@ -155,6 +153,6 @@ jobs: path: ${{ github.workspace }}/op_benchmark - name: Upload Reference Run ID run: | - gh --repo ${GITHUB_REPOSITORY} issue view ${REFERENCE_ISSUE} --json body -q .body | \ + gh --repo ${GITHUB_REPOSITORY} issue view ${reference_issue} --json body -q .body | \ sed "s/Inductor-XPU-OP-Benchmark-Data:.*/Inductor-XPU-OP-Benchmark-Data: ${GITHUB_RUN_ID}/" | sed '/^$/d' > new_body.txt - gh --repo ${GITHUB_REPOSITORY} issue edit ${REFERENCE_ISSUE} --body-file new_body.txt + gh --repo ${GITHUB_REPOSITORY} issue edit ${reference_issue} --body-file new_body.txt diff --git a/.github/workflows/_linux_transformers.yml b/.github/workflows/_linux_transformers.yml index 2d8bfd4af2..fc3deaf7a3 100644 --- 
a/.github/workflows/_linux_transformers.yml +++ b/.github/workflows/_linux_transformers.yml @@ -21,7 +21,7 @@ on: runner: required: true type: string - default: 'linux.idc.xpu' + default: 'pvc_rolling' description: Runner label driver: required: false @@ -58,8 +58,6 @@ env: HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} HF_HUB_ETAG_TIMEOUT: 120 HF_HUB_DOWNLOAD_TIMEOUT: 120 - NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} - DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} python: ${{ inputs.python != '' && inputs.python || '3.10' }} accelerate: ${{ inputs.accelerate != '' && inputs.accelerate || 'v1.7.0'}} datasets: ${{ inputs.datasets != '' && inputs.datasets || 'v3.6.0'}} @@ -76,8 +74,12 @@ env: libswresample-dev libswscale-dev pciutils - PYTEST_TIMEOUT: 600 TORCH_INDEX: '--pre --index-url https://download.pytorch.org/whl/nightly/xpu' + AGENT_TOOLSDIRECTORY: /tmp/xpu-tool + +defaults: + run: + shell: bash {0} jobs: conditions-filter: @@ -101,7 +103,7 @@ jobs: echo "disabled_tests=${disabled_tests}" |tee "${GITHUB_OUTPUT}" prepare: - runs-on: ${{ inputs.runner != '' && inputs.runner || 'linux.idc.xpu' }} + runs-on: ${{ inputs.runner != '' && inputs.runner || 'pvc_rolling' }} needs: conditions-filter if: ${{ !(contains(needs.conditions-filter.outputs.disabled_tests, 'disable_all') || contains(needs.conditions-filter.outputs.disabled_tests, 'disable_transformers')) }} outputs: @@ -109,6 +111,11 @@ jobs: torchvision: ${{ steps.getver.outputs.torchvision }} torchaudio: ${{ steps.getver.outputs.torchaudio }} triton: ${{ steps.getver.outputs.triton }} + runner_id: ${{ steps.runner-info.outputs.runner_id }} + user_id: ${{ steps.runner-info.outputs.user_id }} + render_id: ${{ steps.runner-info.outputs.render_id }} + hostname: ${{ steps.runner-info.outputs.hostname }} + pytest_extra_args: ${{ steps.runner-info.outputs.pytest_extra_args }} steps: - id: getver run: | @@ -127,10 +134,28 @@ jobs: echo "torchvision=$torchvision" | tee -a "$GITHUB_OUTPUT" echo "torchaudio=$torchaudio" | tee -a "$GITHUB_OUTPUT" echo "triton=$triton" | tee -a "$GITHUB_OUTPUT" + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Get runner + id: runner-info + uses: ./.github/actions/get-runner tests: needs: prepare - runs-on: ${{ inputs.runner != '' && inputs.runner || 'linux.idc.xpu' }} + runs-on: ${{ needs.prepare.outputs.runner_id }} + container: + image: mengfeili/intel-pvc-driver:1146-1136 + volumes: + - ${{ github.workspace }}:${{ github.workspace }} + options: --device=/dev/mem --device=/dev/dri --group-add video --group-add ${{ needs.prepare.outputs.render_id }} + --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --shm-size=8g + -u ${{ needs.prepare.outputs.user_id }} + -e ZE_AFFINITY_MASK + env: + PYTORCH_DEBUG_XPU_FALLBACK: '1' + TRANSFORMERS_TEST_DEVICE_SPEC: 'spec.py' + # enable pytest parallel run, and continue others if meets crash case such as segmentation fault + PYTEST_ADDOPTS: -rsf --timeout 600 --timeout_method=thread --dist worksteal ${{ needs.prepare.outputs.pytest_extra_args }} strategy: fail-fast: false max-parallel: 1 @@ -152,52 +177,16 @@ jobs: # * https://github.com/pytorch/pytorch/issues/140965 (aten::_linalg_eigvals) # * https://github.com/huggingface/transformers/issues/36267 (marian tests) - test_case: 'tests_models_0' - cmd: 'tests/models --num-shards 16 --shard-id 0 --ignore=tests/models/marian/test_modeling_marian.py' + cmd: 'tests/models --num-shards 4 --shard-id 0 --ignore=tests/models/marian/test_modeling_marian.py' filter: 
'not test_resize_embeddings_untied and not test_resize_tokens_embeddings' - test_case: 'tests_models_1' - cmd: 'tests/models --num-shards 16 --shard-id 1 --ignore=tests/models/marian/test_modeling_marian.py' + cmd: 'tests/models --num-shards 4 --shard-id 1 --ignore=tests/models/marian/test_modeling_marian.py' filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings' - test_case: 'tests_models_2' - cmd: 'tests/models --num-shards 16 --shard-id 2 --ignore=tests/models/marian/test_modeling_marian.py' + cmd: 'tests/models --num-shards 4 --shard-id 2 --ignore=tests/models/marian/test_modeling_marian.py' filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings' - test_case: 'tests_models_3' - cmd: 'tests/models --num-shards 16 --shard-id 3 --ignore=tests/models/marian/test_modeling_marian.py' - filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings' - - test_case: 'tests_models_4' - cmd: 'tests/models --num-shards 16 --shard-id 4 --ignore=tests/models/marian/test_modeling_marian.py' - filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings' - - test_case: 'tests_models_5' - cmd: 'tests/models --num-shards 16 --shard-id 5 --ignore=tests/models/marian/test_modeling_marian.py' - filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings' - - test_case: 'tests_models_6' - cmd: 'tests/models --num-shards 16 --shard-id 6 --ignore=tests/models/marian/test_modeling_marian.py' - filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings' - - test_case: 'tests_models_7' - cmd: 'tests/models --num-shards 16 --shard-id 7 --ignore=tests/models/marian/test_modeling_marian.py' - filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings' - - test_case: 'tests_models_8' - cmd: 'tests/models --num-shards 16 --shard-id 8 --ignore=tests/models/marian/test_modeling_marian.py' - filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings' - - test_case: 'tests_models_9' - cmd: 'tests/models --num-shards 16 --shard-id 9 --ignore=tests/models/marian/test_modeling_marian.py' - filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings' - - test_case: 'tests_models_10' - cmd: 'tests/models --num-shards 16 --shard-id 10 --ignore=tests/models/marian/test_modeling_marian.py' - filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings' - - test_case: 'tests_models_11' - cmd: 'tests/models --num-shards 16 --shard-id 11 --ignore=tests/models/marian/test_modeling_marian.py' - filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings' - - test_case: 'tests_models_12' - cmd: 'tests/models --num-shards 16 --shard-id 12 --ignore=tests/models/marian/test_modeling_marian.py' - filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings' - - test_case: 'tests_models_13' - cmd: 'tests/models --num-shards 16 --shard-id 13 --ignore=tests/models/marian/test_modeling_marian.py' - filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings' - - test_case: 'tests_models_14' - cmd: 'tests/models --num-shards 16 --shard-id 14 --ignore=tests/models/marian/test_modeling_marian.py' - filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings' - - test_case: 'tests_models_15' - cmd: 'tests/models --num-shards 16 --shard-id 15 --ignore=tests/models/marian/test_modeling_marian.py' + cmd: 'tests/models --num-shards 4 --shard-id 3 
--ignore=tests/models/marian/test_modeling_marian.py' filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings' # Excluding tests due to: # * Some ray tests hang, reason unknown @@ -212,9 +201,6 @@ jobs: - test_case: 'tests_utils' cmd: '--ignore=tests/utils/test_import_utils.py tests/utils' filter: 'not test_load_img_url_timeout' - env: - PYTORCH_DEBUG_XPU_FALLBACK: '1' - TRANSFORMERS_TEST_DEVICE_SPEC: 'spec.py' steps: - name: Checkout torch-xpu-ops uses: actions/checkout@v4 @@ -253,20 +239,18 @@ jobs: sleep 1; if (( $SECONDS - start_time > 60 )); then false; fi done - - name: Create unique Conda ENV name - run: | - random=$(head /dev/urandom | tr -dc A-Za-z0-9_ | head -c ${1:-5} | xargs) - echo "CONDA_ENV_NAME=hf_transformers_test_${ZE_AFFINITY_MASK}_${random}" >> $GITHUB_ENV - - name: Prepare Conda ENV + - name: Setup python-${{ env.python }} + uses: actions/setup-python@v5 + with: + python-version: ${{ env.python }} + - name: Check python run: | - echo "Using Conda ENV name: $CONDA_ENV_NAME" - conda remove --all -y -n $CONDA_ENV_NAME || rm -rf $(dirname ${CONDA_EXE})/../envs/$CONDA_ENV_NAME - conda create -y -n $CONDA_ENV_NAME python=${{ env.python }} - source activate $CONDA_ENV_NAME - pip install junitparser pytest-shard pytest-timeout - - name: Prepare Stock XPU Pytorch + which python && python -V + which pip && pip list + pip install -U pip wheel setuptools + - name: Prepare pytorch and deps run: | - source activate $CONDA_ENV_NAME + pip install junitparser pip install $TORCH_INDEX \ torch==${{ needs.prepare.outputs.torch }} \ torchvision==${{ needs.prepare.outputs.torchvision }} \ @@ -275,7 +259,6 @@ jobs: - name: Prepare Transformers run: | pwd - source activate $CONDA_ENV_NAME cd transformers pip install \ accelerate==${{ env.accelerate }} \ @@ -287,7 +270,6 @@ jobs: cp ${{ github.workspace }}/torch-xpu-ops/.github/scripts/spec.py ./ - name: Report installed versions run: | - source activate $CONDA_ENV_NAME LOGS_DIR="${{ github.workspace }}/transformers/logs" echo "pip installed packages:" pip list | tee "$LOGS_DIR/pip_list-$TEST_CASE.txt" @@ -299,36 +281,35 @@ jobs: xpu-smi discovery -y --json --dump -1 - name: Sanity check installed packages run: | - source activate $CONDA_ENV_NAME + # Use latest pytest + pip install -U pytest pytest-timeout pytest-xdist pytest-shard # These checks are to exit earlier if for any reason Transformers # reinstalled torch packages back to CUDA versions (not expected). pip show torch | grep Version | grep xpu pip show torchaudio | grep Version | grep xpu pip show torchvision | grep Version | grep xpu python -c 'import torch; exit(not torch.xpu.is_available())' - - name: Run tests + - name: Run tests on ${{ needs.prepare.outputs.hostname }} run: | - source activate $CONDA_ENV_NAME cd transformers - python3 -m pytest -rsf --make-reports=$TEST_CASE --junit-xml=reports/$TEST_CASE.xml \ - -k "${{ matrix.test.filter}}" ${{ matrix.test.cmd }} || true + python -m pytest --make-reports=${TEST_CASE} --junit-xml=reports/${TEST_CASE}.xml \ + -k "${{ matrix.test.filter}}" ${{ matrix.test.cmd }} || true - name: Check for errors in tests run: | - source activate $CONDA_ENV_NAME - python3 torch-xpu-ops/.github/scripts/check-transformers.py transformers/reports/*.xml + python torch-xpu-ops/.github/scripts/check-transformers.py transformers/reports/*.xml - name: Print environment if: ${{ ! 
cancelled() }} uses: ./torch-xpu-ops/.github/actions/print-environment with: - conda: $CONDA_ENV_NAME pip_packages: 'accelerate datasets transformers' to: 'transformers/logs/environment-$TEST_CASE.md' - name: Clean up if: ${{ always() }} run: | - du -sh ${{ env.HF_HOME }} || true - if [ -n "$CONDA_ENV_NAME" ]; then - conda remove --all -y -n $CONDA_ENV_NAME || rm -rf $(dirname ${CONDA_EXE})/../envs/$CONDA_ENV_NAME + if [ -d "$HF_HOME" ]; then + ls -al ${{ env.HF_HOME }} + du -sh ${{ env.HF_HOME }} + rm -rf ${{ env.HF_HOME }} fi - name: Upload reports if: ${{ ! cancelled() }} @@ -346,7 +327,7 @@ jobs: report: needs: tests if: ${{ success() || failure() }} - runs-on: ${{ inputs.runner != '' && inputs.runner || 'linux.idc.xpu' }} + runs-on: ubuntu-24.04 steps: - name: Download reports uses: actions/download-artifact@v4 @@ -366,16 +347,12 @@ jobs: uses: actions/checkout@v4 with: path: torch-xpu-ops - - name: Create unique Conda ENV name - run: | - random=$(head /dev/urandom | tr -dc A-Za-z0-9_ | head -c ${1:-5} | xargs) - echo "CONDA_ENV_NAME=hf_transformers_test_${ZE_AFFINITY_MASK}_${random}" >> $GITHUB_ENV - - name: Prepare Conda ENV + - name: Setup python-${{ env.python }} + uses: actions/setup-python@v5 + with: + python-version: ${{ env.python }} + - name: Install pip deps run: | - echo "Using Conda ENV name: $CONDA_ENV_NAME" - conda remove --all -y -n $CONDA_ENV_NAME || rm -rf $(dirname ${CONDA_EXE})/../envs/$CONDA_ENV_NAME - conda create -y -n $CONDA_ENV_NAME python=${{ env.python }} - source activate $CONDA_ENV_NAME pip install junitparser - name: Print results table if: ${{ ! cancelled() }} @@ -409,8 +386,7 @@ jobs: - name: Print baseline difference if: ${{ ! cancelled() }} run: | - source activate $CONDA_ENV_NAME - python3 torch-xpu-ops/.github/scripts/check-transformers.py transformers/reports/*.xml >> $GITHUB_STEP_SUMMARY || true + python torch-xpu-ops/.github/scripts/check-transformers.py transformers/reports/*.xml >> $GITHUB_STEP_SUMMARY || true - name: Print failure lines if: ${{ ! cancelled() }} run: | @@ -477,9 +453,3 @@ jobs: for f in $(find transformers/logs -name "environment-*.md"); do diff $f $first_md done - - name: Clean up - if: ${{ always() }} - run: | - if [ -n "$CONDA_ENV_NAME" ]; then - conda remove --all -y -n $CONDA_ENV_NAME || rm -rf $(dirname ${CONDA_EXE})/../envs/$CONDA_ENV_NAME - fi diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 146db6c72e..1751d9d972 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -29,6 +29,11 @@ permissions: read-all defaults: run: shell: bash -xe {0} +env: + GH_TOKEN: ${{ github.token }} + HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + UT_SKIP_ISSUE: 1624 jobs: runner: @@ -50,10 +55,10 @@ jobs: id: runner-info uses: ./.github/actions/get-runner - docker: + test-in-container: needs: runner - runs-on: ${{ needs.runner.outputs.runner_id }} if: ${{ ! 
contains(inputs.ut, 'distributed') }} + runs-on: ${{ needs.runner.outputs.runner_id }} container: image: mengfeili/intel-pvc-driver:1146-1136 volumes: @@ -64,30 +69,19 @@ jobs: env: AGENT_TOOLSDIRECTORY: /tmp/xpu-tool PYTEST_ADDOPTS: -v --timeout 600 --timeout_method=thread --dist worksteal ${{ needs.runner.outputs.pytest_extra_args }} - env: - GH_TOKEN: ${{ github.token }} - HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} steps: - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - - name: Prepare test env on ${{ needs.runner.outputs.hostname }} + - name: Prepare test env uses: ./.github/actions/linux-testenv with: pytorch: ${{ inputs.pytorch }} torch_xpu_ops: ${{ inputs.torch_xpu_ops }} python: ${{ inputs.python }} - - name: Run XPU UT Test + - name: Run XPU UT Test on ${{ needs.runner.outputs.hostname }} uses: ./.github/actions/linux-uttest with: ut_name: ${{ inputs.ut }} - - name: UT Test Results Summary - run: | - pip install junitparser - python ./.github/scripts/check-ut.py ${{ github.workspace }}/ut_log/*.xml >> $GITHUB_STEP_SUMMARY || true - if [ -e ut_failure_list.csv ];then - cp ut_failure_list.csv ${{ github.workspace }}/ut_log/ut_failure_list.csv || true - fi - name: Upload Inductor XPU UT Log if: ${{ ! cancelled() }} uses: actions/upload-artifact@v4 @@ -102,47 +96,27 @@ jobs: name: XPU-UT-Failure-List-${{ github.event.pull_request.number || github.sha }}-${{ inputs.ut }} path: ${{ github.workspace }}/ut_log/ut_failure_list.csv if-no-files-found: ignore - - name: Reset Ptrace_scope - if: ${{ always() }} - run: | - if [ -f "${{ github.workspace }}/ptrace_scope.bk" ]; then - sudo cp ${{ github.workspace }}/ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope - fi - host: + test-in-baremetal: needs: runner - runs-on: ${{ needs.runner.outputs.runner_id }} if: ${{ contains(inputs.ut, 'distributed') }} + runs-on: ${{ needs.runner.outputs.runner_id }} env: AGENT_TOOLSDIRECTORY: /tmp/xpu-tool - GH_TOKEN: ${{ github.token }} - HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} PYTEST_ADDOPTS: -v --timeout 600 --timeout_method=thread -n 1 steps: - - name: Init test - run: | - sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf - sudo rm -rf ~/.triton /tmp/*inductor* /tmp/tmp* - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - - name: Prepare test env on ${{ needs.runner.outputs.runner_id }} + - name: Prepare test env uses: ./.github/actions/linux-testenv with: pytorch: ${{ inputs.pytorch }} torch_xpu_ops: ${{ inputs.torch_xpu_ops }} python: ${{ inputs.python }} - - name: Run XPU UT Test + - name: Run XPU UT Test on ${{ needs.runner.outputs.hostname }} uses: ./.github/actions/linux-uttest with: ut_name: ${{ inputs.ut }} - - name: UT Test Results Summary - run: | - pip install junitparser - python ./.github/scripts/check-ut.py ${{ github.workspace }}/ut_log/*.xml >> $GITHUB_STEP_SUMMARY || true - if [ -e ut_failure_list.csv ];then - cp ut_failure_list.csv ${{ github.workspace }}/ut_log/ut_failure_list.csv || true - fi - name: Upload Inductor XPU UT Log if: ${{ ! 
cancelled() }} uses: actions/upload-artifact@v4 @@ -157,21 +131,12 @@ jobs: name: XPU-UT-Failure-List-${{ github.event.pull_request.number || github.sha }}-${{ inputs.ut }} path: ${{ github.workspace }}/ut_log/ut_failure_list.csv if-no-files-found: ignore - - name: Reset Ptrace_scope - if: ${{ always() }} - run: | - if [ -f "${{ github.workspace }}/ptrace_scope.bk" ]; then - sudo cp ${{ github.workspace }}/ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope - fi summary: - needs: [docker, host] - if: ${{ ! cancelled() && ! (endsWith(needs.docker.result, 'ed') && endsWith(needs.host.result, 'ed')) }} + needs: [test-in-container, test-in-baremetal] + if: ${{ ! cancelled() && ! (endsWith(needs.test-in-container.result, 'ed') && endsWith(needs.test-in-baremetal.result, 'ed')) }} runs-on: ubuntu-24.04 timeout-minutes: 30 - env: - GH_TOKEN: ${{ github.token }} - UT_SKIP_ISSUE: 1624 steps: - name: Checkout torch-xpu-ops uses: actions/checkout@v4 @@ -181,15 +146,22 @@ jobs: name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-${{ inputs.ut }} path: ${{ github.workspace }}/ut_log - name: Check UT Results - shell: bash run: | ls -al ${{ github.workspace }}/ut_log cd ${{ github.workspace }}/ut_log/${{ inputs.ut }} + + for log_file in "${{ github.workspace }}/ut_log"/{failures,passed,category,reproduce}_*.log; do + [[ -f "$log_file" ]] && cp "$log_file" ./ + done # get distributed known issues gh --repo intel/torch-xpu-ops issue view $UT_SKIP_ISSUE --json body -q .body |sed -E '/^(#|$)/d' > Known_issue.log.tmp # get skipped known issues - gh api "repos/${{ github.repository }}/issues?labels=skipped" \ - --jq '.[] | select(.pull_request == null) | "Issue #\(.number): \(.title)\n\(.body)\n"' > issues.log + count=$(gh api "repos/${{ github.repository }}/issues?labels=skipped" --jq 'length') + if [ "$count" -gt 0 ]; then + echo -e "$count issues with skipped label found" + gh api "repos/${{ github.repository }}/issues?labels=skipped" \ + --jq '.[] | select(.pull_request == null) | "Issue #\(.number): \(.title)\n\(.body)\n"' > issues.log + fi if [ "${{ inputs.ut }}" == "basic" ];then ut_list="op_regression op_transformers op_extended op_regression_dev1" else @@ -201,7 +173,7 @@ jobs: cp Known_issue.log.tmp Known_issue.log awk -v r="${ut_name}" 'BEGIN{ print_row = 0 }{ if ( ! 
( $0 ~ /[a-zA-Z0-9]/ ) ) { print_row = 0 }; - if ( print_row == 1 && $NF ~ r ) { print $1 }; + if ( print_row == 1 && $1 ~ r ) { print $0 }; if ( $0 ~ /Cases:/ ) { print_row = 1 }; }' issues.log >> Known_issue.log bash ut_result_check.sh ${ut_name} diff --git a/.github/workflows/_windows_ut.yml b/.github/workflows/_windows_ut.yml index 14129446e2..e650e32d14 100644 --- a/.github/workflows/_windows_ut.yml +++ b/.github/workflows/_windows_ut.yml @@ -112,7 +112,7 @@ jobs: call conda activate windows_ci cd ../pytorch pip install -r requirements.txt - pip install cmake setuptools==72.1.0 clang-format + pip install cmake setuptools clang-format pip install mkl-static mkl-include set USE_STATIC_MKL=1 copy "%CONDA_PREFIX%\Library\bin\libiomp*5md.dll" .\torch\lib @@ -123,7 +123,8 @@ jobs: set CMAKE_PREFIX_PATH="%CONDA_PREFIX%\Library" ) python setup.py clean - set MAX_JOBS=4 + set MAX_JOBS=32 + set TORCH_XPU_ARCH_LIST=mtl-h,bmg,lnl-m python setup.py bdist_wheel > build_torch_wheel_log.log echo "[INFO] begin to install torch whls" for /r C:\actions-runner\_work\torch-xpu-ops\pytorch\dist %%i in (torch*.whl) do ( @@ -141,24 +142,22 @@ jobs: python -c "import torch; print(torch.__config__.show())" python -c "import torch; print(torch.__config__.parallel_info())" python -c "import torch; print(torch.__config__.torch.xpu.device_count())" - - name: Upload Windows build log if: ${{ ! cancelled() }} uses: actions/upload-artifact@v4 with: name: Torch-XPU-Windows-Log-${{ github.event.pull_request.number || github.sha }} path: 'C:\actions-runner\_work\torch-xpu-ops\pytorch\build_torch_wheel_log.log' - - name: Upload Windows binary if: ${{ ! cancelled() }} uses: actions/upload-artifact@v4 with: name: Torch-XPU-Windows-Binary-${{ github.event.pull_request.number || github.sha }} path: 'C:\actions-runner\_work\torch-xpu-ops\pytorch\dist' - - name: Run XPU OP Extended UT if: contains(inputs.ut, 'op_extended') || github.event_name == 'schedule' shell: cmd + continue-on-error: true run: | call "C:\ProgramData\miniforge3\Scripts\activate.bat" call "C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\VC\Auxiliary\Build\vcvars64.bat" @@ -168,9 +167,12 @@ jobs: cd ../pytorch/third_party/torch-xpu-ops/test/xpu/extended/ python run_test_with_skip_mtl.py + if not exist "%GITHUB_WORKSPACE%\ut_log" mkdir "%GITHUB_WORKSPACE%\ut_log" + copy op_extended.xml %GITHUB_WORKSPACE%\ut_log /Y - name: Run Test XPU UT - if: contains(inputs.ut, 'torch_xpu') || github.event_name == 'schedule' + if: contains(inputs.ut, 'test_xpu') || github.event_name == 'schedule' shell: cmd + continue-on-error: true run: | call "C:\ProgramData\miniforge3\Scripts\activate.bat" call "C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\VC\Auxiliary\Build\vcvars64.bat" @@ -178,3 +180,103 @@ jobs: call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" cd ../pytorch/third_party/torch-xpu-ops/test/xpu/ python run_test_win_with_skip_mtl.py + + if not exist "%GITHUB_WORKSPACE%\ut_log" mkdir "%GITHUB_WORKSPACE%\ut_log" + copy test_xpu.xml %GITHUB_WORKSPACE%\ut_log /Y + - name: UT Test Results Summary + shell: cmd + run: | + call conda activate windows_ci + pip install junitparser + echo "GITHUB_WORKSPACE: %GITHUB_WORKSPACE%" + for %%i in ("%GITHUB_WORKSPACE%\ut_log\*.xml") do ( + python .\.github\scripts\check-ut.py "%%i" >> "%GITHUB_STEP_SUMMARY%" + ) + @echo off + + REM Check the failure logs + if exist "%GITHUB_WORKSPACE%\failures*.log" ( + echo Exist Failure logs + echo Found Failure logs as below: + for %%f in 
("%GITHUB_WORKSPACE%\failures*.log") do ( + echo - %%f + copy "%%f" "%GITHUB_WORKSPACE%\ut_log\" + ) + echo Failure logs Copied + ) else ( + echo No Failure logs + ) + + REM Copied the passed logs + if exist "passed*.log" ( + copy "passed*.log" "%GITHUB_WORKSPACE%\ut_log\" + echo Passed logs Copied + ) else ( + echo No Passed logs + ) + + REM Copied the Summary logs + if exist "category*.log" ( + copy "category*.log" "%GITHUB_WORKSPACE%\ut_log\" + echo Category logs Copied + ) else ( + echo No Category logs + ) + - name: Upload Inductor XPU UT Log + if: ${{ ! cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-Windows + path: "${{ github.workspace }}/ut_log" + if-no-files-found: ignore + + summary: + needs: [ut_test] + runs-on: ubuntu-24.04 + timeout-minutes: 30 + env: + GH_TOKEN: ${{ github.token }} + steps: + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Download XPU UT Logs + uses: actions/download-artifact@v4 + with: + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-Windows + path: ${{ github.workspace }}/ut_log + - name: Check UT Results + shell: bash + run: | + ls -al ${{ github.workspace }}/ut_log + cd ${{ github.workspace }}/ut_log + + # get skipped known issues + count=$(gh api "repos/${{ github.repository }}/issues?labels=skipped_windows" --jq 'length') + if [ "$count" -gt 0 ]; then + echo -e "$count issues with skipped label found" + gh api "repos/${{ github.repository }}/issues?labels=skipped_windows" \ + --jq '.[] | select(.pull_request == null) | "Issue #\(.number): \(.title)\n\(.body)\n"' > issues.log + fi + + cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./ + for ut_name in $(echo ${{ inputs.ut }} |sed 's/,/ /g') + do + touch Known_issue.log + if [ -f "issues.log" ]; then + awk -v r="${ut_name}" 'BEGIN{ print_row = 0 }{ + if ( ! ( $0 ~ /[a-zA-Z0-9]/ ) ) { print_row = 0 }; + if ( print_row == 1 && $1 ~ r ) { print $0 }; + if ( $0 ~ /Cases:/ ) { print_row = 1 }; + }' issues.log > Known_issue.log + else + echo "Info: issues.log not found or empty, using empty Known_issue.log" + fi + bash ut_result_check.sh ${ut_name} + done + - name: Upload Inductor XPU UT Log + if: ${{ ! 
cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: Inductor-XPU-UT-Data-${{ github.event.pull_request.number || github.sha }}-Windows + path: ${{ github.workspace }}/ut_log + overwrite: true diff --git a/.github/workflows/bisect_search.yml b/.github/workflows/bisect_search.yml index b4e657b847..4adba0df6c 100644 --- a/.github/workflows/bisect_search.yml +++ b/.github/workflows/bisect_search.yml @@ -37,6 +37,10 @@ on: permissions: read-all +defaults: + run: + shell: bash -xe {0} + jobs: get_runner: runs-on: ${{ inputs.runner }} @@ -87,12 +91,9 @@ jobs: USE_XCCL: 0 USE_KINETO: 0 env: - HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - GH_TOKEN: ${{ github.token }} - defaults: - run: - shell: bash -xe {0} + GH_TOKEN: ${{ github.token }} + HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} steps: - name: Check runner run: | diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index 51ef597765..d408622efe 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -95,7 +95,7 @@ jobs: echo "No such scheduler: ${{ github.event.schedule }}" exit 1 fi - ut='["basic","op_ut","xpu_profiling","torch_xpu","xpu_distributed"]' + ut='["basic","op_ut","xpu_profiling","xpu_distributed"]' suite='["huggingface","timm_models","torchbench","pt2e"]' triton='' python='3.10' @@ -144,6 +144,7 @@ jobs: if: ${{ github.event_name == 'schedule' || contains(inputs.ut, 'p') }} name: linux-ut needs: [Conditions-Filter, Linux-Nightly-Ondemand-Build] + secrets: inherit strategy: fail-fast: false matrix: @@ -160,6 +161,7 @@ jobs: if: ${{ github.event_name == 'schedule' || contains(inputs.suite, 'e') }} name: linux-e2e needs: [Conditions-Filter, Linux-Nightly-Ondemand-Build] + secrets: inherit strategy: fail-fast: false matrix: @@ -189,6 +191,7 @@ jobs: name: linux-microbench permissions: write-all needs: [Conditions-Filter, Linux-Nightly-Ondemand-Build] + secrets: inherit strategy: fail-fast: false matrix: @@ -203,9 +206,10 @@ jobs: if: ${{ github.event_name == 'schedule' || contains(inputs.ut, 'windows') }} name: windows needs: [Conditions-Filter] + secrets: inherit uses: ./.github/workflows/_windows_ut.yml with: - ut: 'op_extended,torch_xpu' + ut: 'op_extended,test_xpu' python: ${{ needs.Conditions-Filter.outputs.python }} src_changed: false has_label: true diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index bfb2d913a8..be6b260734 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -60,6 +60,7 @@ jobs: src_changed: ${{ steps.check-files.outputs.src_changed }} has_label: ${{ steps.check-label.outputs.has_label }} disabled_tests: ${{ steps.check-pr-desc.outputs.disabled_tests }} + pytorch: ${{ contains(steps.check-pr-desc.outputs.disabled_tests, 'disable_build') && 'nightly_wheel' || github.base_ref }} steps: - uses: dorny/paths-filter@v2 id: check-files @@ -90,7 +91,7 @@ jobs: echo "disabled_tests=${disabled_tests}" |tee "${GITHUB_OUTPUT}" linux-build: - if: ${{ !contains(needs.conditions-filter.outputs.disabled_tests, 'disable_all')}} + if: ${{ !contains(needs.conditions-filter.outputs.disabled_tests, 'disable_all') }} needs: [conditions-filter, preci-lint-check] secrets: inherit strategy: @@ -100,11 +101,12 @@ jobs: uses: ./.github/workflows/_linux_build.yml with: runner: pvc_rolling - pytorch: main + pytorch: ${{ needs.conditions-filter.outputs.pytorch }} 
linux-ut: needs: [conditions-filter, linux-build] - if: ${{ !contains(needs.conditions-filter.outputs.disabled_tests, 'disable_ut')}} + if: ${{ !contains(needs.conditions-filter.outputs.disabled_tests, 'disable_ut') }} + secrets: inherit strategy: fail-fast: false matrix: @@ -112,12 +114,13 @@ jobs: uses: ./.github/workflows/_linux_ut.yml with: runner: pvc_rolling - pytorch: main + pytorch: ${{ needs.conditions-filter.outputs.pytorch }} ut: ${{ matrix.ut_name }} linux-distributed: needs: [conditions-filter, linux-build] - if: ${{ !contains(needs.conditions-filter.outputs.disabled_tests, 'disable_distribute')}} + if: ${{ !contains(needs.conditions-filter.outputs.disabled_tests, 'disable_distribute') }} + secrets: inherit strategy: fail-fast: false matrix: @@ -125,12 +128,13 @@ jobs: uses: ./.github/workflows/_linux_ut.yml with: runner: pvc_rolling - pytorch: main + pytorch: ${{ needs.conditions-filter.outputs.pytorch }} ut: ${{ matrix.ut_name }} linux-e2e: name: linux-e2e if: ${{ !contains(needs.conditions-filter.outputs.disabled_tests, 'disable_e2e') }} + secrets: inherit needs: [conditions-filter, linux-build] strategy: fail-fast: false @@ -139,7 +143,7 @@ jobs: uses: ./.github/workflows/_linux_e2e.yml with: runner: pvc_rolling - pytorch: main + pytorch: ${{ needs.conditions-filter.outputs.pytorch }} suite: ${{ matrix.suite }} linux-e2e-summary: name: linux-e2e-summary @@ -152,9 +156,10 @@ jobs: name: windows if: ${{ !(contains(needs.conditions-filter.outputs.disabled_tests, 'disable_all') || contains(needs.conditions-filter.outputs.disabled_tests, 'disable_win')) }} needs: [conditions-filter, preci-lint-check] + secrets: inherit uses: ./.github/workflows/_windows_ut.yml with: - ut: op_extended,torch_xpu + ut: op_extended,test_xpu runner: Windows_CI src_changed: ${{ needs.conditions-filter.outputs.src_changed }} has_label: ${{ needs.conditions-filter.outputs.has_label }} diff --git a/src/ATen/native/xpu/Copy.cpp b/src/ATen/native/xpu/Copy.cpp index b011baa80b..0fe0ba2636 100644 --- a/src/ATen/native/xpu/Copy.cpp +++ b/src/ATen/native/xpu/Copy.cpp @@ -71,7 +71,12 @@ void memcpyAsync( Device dst_device = iter.device(0); Device src_device = iter.device(1); if (dst_device == src_device) { - copy_kernel(iter); + // copy_kernel(iter); + auto dst = (char*)iter.data_ptr(0); + auto src = (char*)iter.data_ptr(1); + size_t size = iter.numel() * iter.element_size(0); + auto q = copy_stream.queue(); + q.copy(src, dst, size); } else { TORCH_INTERNAL_ASSERT(p2p_enabled == true); auto dst = (char*)iter.data_ptr(0); diff --git a/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp b/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp index 011d1ab451..01016494cf 100644 --- a/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -627,6 +628,220 @@ struct AdaptiveAvgPool2dKernelFunctor { PackedTensorAccessor64 output_; }; +template +struct AdaptiveAvgPool2dKernelFunctor_cl { + void operator()(sycl::nd_item<1> item) const { + int64_t index = item.get_global_linear_id(); + if (index < numel_) { + int _ow, _oh, _oc, _ob; + int oc_vec_ = oc_ / vec_size; + + _oc = index % oc_vec_; + _ow = index / oc_vec_ % ow_; + _oh = index / oc_vec_ / ow_ % oh_; + _ob = index / oc_vec_ / ow_ / oh_; + + int64_t _ih0 = native::start_index(_oh, oh_, ih_); + int64_t _ih1 = native::end_index(_oh, oh_, ih_); + int64_t _iw0 = native::start_index(_ow, ow_, iw_); + 
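      // Assuming the usual at::native adaptive-pooling helpers:
      //   start_index(o, osize, isize) == floor(o * isize / osize)
      //   end_index(o, osize, isize)   == ceil((o + 1) * isize / osize)
      // so each output element (_oh, _ow) averages a kh x kw input window, and
      // neighbouring windows can differ in size by one when isize is not a
      // multiple of osize.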
int64_t _iw1 = native::end_index(_ow, ow_, iw_); + int64_t kh = _ih1 - _ih0; + int64_t kw = _iw1 - _iw0; + int64_t _ib = _ob; + int64_t _ic = _oc; + + opmath_t sum[vec_size] = {static_cast(0)}; + for (int _ih = _ih0; _ih < _ih1; _ih++) { + for (int _iw = _iw0; _iw < _iw1; _iw++) { + auto read = input_ + [_ic + _iw * oc_vec_ + _ih * oc_vec_ * iw_ + + _ib * ih_ * iw_ * oc_vec_]; +#pragma unroll + for (int v = 0; v < vec_size; v++) { + sum[v] += opmath_t(read[v]); + } + } + } +#pragma unroll + for (int v = 0; v < vec_size; v++) { + sum[v] /= kh * kw; + } + vec_t output_value; +#pragma unroll + for (int v = 0; v < vec_size; v++) { + output_value[v] = static_cast(sum[v]); + } + output_[index] = output_value; + } + } + AdaptiveAvgPool2dKernelFunctor_cl( + vec_t* output, + const vec_t* input, + int ih, + int iw, + int ob, + int oc, + int oh, + int ow, + int64_t numel) + : output_(output), + input_(input), + ih_(ih), + iw_(iw), + ob_(ob), + oc_(oc), + oh_(oh), + ow_(ow), + numel_(numel) {} + + private: + int ih_; + int iw_; + int ob_; + int oc_; + int oh_; + int ow_; + int64_t numel_; + const vec_t* input_; + vec_t* output_; +}; + +#define LAUNCH_AVGPOOL_CHANNEL_LAST_VEC( \ + scalar_t, \ + opmath_t, \ + vec_size, \ + num_wg, \ + wg_size, \ + queue, \ + output, \ + input, \ + ih, \ + iw, \ + ob, \ + oc, \ + oh, \ + ow, \ + numel) \ + { \ + using vec_t = memory::aligned_vector; \ + vec_t* output_vec = \ + reinterpret_cast(output.mutable_data_ptr()); \ + const vec_t* input_vec = \ + reinterpret_cast(input.const_data_ptr()); \ + auto kfn = AdaptiveAvgPool2dKernelFunctor_cl< \ + scalar_t, \ + opmath_t, \ + vec_t, \ + vec_size>(output_vec, input_vec, ih, iw, ob, oc, oh, ow, numel); \ + sycl_kernel_submit(num_wg* wg_size, wg_size, queue, kfn); \ + } + +template +void launch_adaptive_avg_pool2d_kernel_cl(const Tensor& input, Tensor& output) { + int ih = input.size(2); + int iw = input.size(3); + int ob = output.size(0); + int oc = output.size(1); + int oh = output.size(2); + int ow = output.size(3); + + int64_t numel = ob * oc * oh * ow; + int vec_size = 1; + for (vec_size = std::min( + 8, + memory::can_vectorize_up_to( + (char*)output.mutable_data_ptr())); + vec_size > 1; + vec_size /= 2) { + if (oc % vec_size != 0) + continue; + if (2 * numel / vec_size > syclMaxWorkItemsPerTile()) { + numel /= vec_size; + break; + } + } + + auto wg_size = syclDeviceMaxWorkGroupSize(); + int64_t num_wg = (numel + wg_size - 1) / wg_size; + switch (vec_size) { + case 8: + LAUNCH_AVGPOOL_CHANNEL_LAST_VEC( + scalar_t, + opmath_t, + 8, + num_wg, + wg_size, + at::xpu::getCurrentSYCLQueue(), + output, + input, + ih, + iw, + ob, + oc, + oh, + ow, + numel); + return; + case 4: + LAUNCH_AVGPOOL_CHANNEL_LAST_VEC( + scalar_t, + opmath_t, + 4, + num_wg, + wg_size, + at::xpu::getCurrentSYCLQueue(), + output, + input, + ih, + iw, + ob, + oc, + oh, + ow, + numel); + return; + case 2: + LAUNCH_AVGPOOL_CHANNEL_LAST_VEC( + scalar_t, + opmath_t, + 2, + num_wg, + wg_size, + at::xpu::getCurrentSYCLQueue(), + output, + input, + ih, + iw, + ob, + oc, + oh, + ow, + numel); + return; + case 1: + LAUNCH_AVGPOOL_CHANNEL_LAST_VEC( + scalar_t, + opmath_t, + 1, + num_wg, + wg_size, + at::xpu::getCurrentSYCLQueue(), + output, + input, + ih, + iw, + ob, + oc, + oh, + ow, + numel); + return; + default: + TORCH_INTERNAL_ASSERT(false, "Unexpected vectorization size"); + } +} +#undef LAUNCH_AVGPOOL_CHANNEL_LAST_VEC + template void launch_adaptive_avg_pool2d_kernel( PackedTensorAccessor64 input, @@ -724,8 +939,13 @@ void adaptive_avg_pool2d_kernel( auto 
iacc = input_.packed_accessor64(); auto oacc = output.packed_accessor64(); if (is_smf_channels_last(output)) { - launch_adaptive_avg_pool2d_kernel( - iacc, oacc); + if (input_.is_contiguous(at::MemoryFormat::ChannelsLast)) { + launch_adaptive_avg_pool2d_kernel_cl( + input_, output); + } else { + launch_adaptive_avg_pool2d_kernel( + iacc, oacc); + } } else { launch_adaptive_avg_pool2d_kernel( iacc, oacc); diff --git a/src/ATen/native/xpu/sycl/BatchNormKernels.cpp b/src/ATen/native/xpu/sycl/BatchNormKernels.cpp index 2cccaba92b..f8e1b6906e 100644 --- a/src/ATen/native/xpu/sycl/BatchNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/BatchNormKernels.cpp @@ -423,7 +423,7 @@ template < typename index_t> struct BatchNormCollectStatisticsKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<2> item) const { int plane = item.get_group(1); int tid = item.get_local_linear_id(); @@ -1874,7 +1874,7 @@ template < typename index_t> struct BatchNormBackwardReduceKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<2> item) const { index_t plane = item.get_group(1); @@ -4162,7 +4162,7 @@ template < typename stat_accscalar_t, typename index_t> struct BatchNormBackwardKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<2> item) const { index_t plane = item.get_group(1); index_t N = grad_output_.size(0) * grad_output_.size(2); @@ -4370,7 +4370,7 @@ template < typename index_t> struct BatchNormBackwardVectorizedKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<2> item) const { index_t plane = item.get_group(1); index_t N = grad_output_.size(0) * grad_output_.size(2); diff --git a/src/ATen/native/xpu/sycl/Dequant_int4.cpp b/src/ATen/native/xpu/sycl/Dequant_int4.cpp index b217e18ad1..8a52dbfb32 100644 --- a/src/ATen/native/xpu/sycl/Dequant_int4.cpp +++ b/src/ATen/native/xpu/sycl/Dequant_int4.cpp @@ -22,7 +22,7 @@ struct DequantInt4KernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { weight_dequant(weight_dequant) {} void sycl_ker_config_convention(sycl::handler& cgh) {} - [[intel::reqd_sub_group_size(SgSize)]] void operator()( + [[sycl::reqd_sub_group_size(SgSize)]] void operator()( sycl::nd_item<1> it) const { int constexpr GroupN = TileN; int constexpr GroupK = SgSize * TileK; diff --git a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp index 454da74825..f4b3424a54 100644 --- a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp +++ b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp @@ -545,7 +545,7 @@ void launch_max_pool2d_kernel( if constexpr (is_channels_last) { for (vec_size = std::min(8, memory::can_vectorize_up_to((char*)input)); - vec_size >= 1; + vec_size > 1; vec_size /= 2) { if (numPlane % vec_size != 0) { continue; diff --git a/src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp b/src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp index 899ffab56c..0a2b5cdbc7 100644 --- a/src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp +++ b/src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp @@ -25,8 +25,7 @@ template < int r_args_depth = 1, int res_arg_index = 0> struct 
LpNormFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - template - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + template void operator()( const int64_t chunk_size, TLA tlAddress, TLW tlWGMeta, @@ -117,7 +116,7 @@ struct LpNormFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { template struct lpnormChunkReduceKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<1> item_id) const { auto lid = item_id.get_local_linear_id(); auto group_id = item_id.get_group(0); @@ -481,8 +480,7 @@ std::vector foreach_norm_kernel( template struct LpMaxFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - template - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + template void operator()( int64_t chunk_size, TLA tlAddressMeta, TLW tlWGMeta, @@ -555,7 +553,7 @@ struct LpMaxFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { template struct LpmaxChunkReduceKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<1> item_id) const { auto local_range = item_id.get_local_range(0); auto lid = item_id.get_local_linear_id(); diff --git a/src/ATen/native/xpu/sycl/GroupNormKernels.cpp b/src/ATen/native/xpu/sycl/GroupNormKernels.cpp index 935ab99f74..261c9a0627 100644 --- a/src/ATen/native/xpu/sycl/GroupNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/GroupNormKernels.cpp @@ -66,7 +66,7 @@ struct GNRowwiseMomentsFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { using WelfordOp = WelfordOpsXPU>; - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<1> item) const { const int64_t i = item.get_group(0); WelfordOp welford_op = {/*correction=*/0, /*take_sqrt=*/false, item}; @@ -114,7 +114,7 @@ struct GNRowwiseMomentsVectorizedFunctor WelfordOpsXPU>; using vec_t = memory::aligned_vector; - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<1> item) const { WelfordType val[VEC_SIZE]; WelfordOp welford_op = {/*correction=*/0, /*take_sqrt=*/false, item}; @@ -476,7 +476,7 @@ void group_norm_kernel( template struct Compute1dBackwardFusedParamsFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<2> item) const { const int64_t G = group_; const int64_t D = C_ / G; @@ -630,7 +630,7 @@ template struct GammaBeta1dBackwardLargeKernel : public __SYCL_KER_CONFIG_CONVENTION__ { using T_ACC = acc_type_device; - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<2> item) const { const int64_t c = item.get_group(1) * item.get_local_range(1) + item.get_local_id(1); @@ -890,7 +890,7 @@ template struct ComputeInternalGradientsFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { using T_ACC = acc_type_device; - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<1> item) const { const int64_t nc = item.get_group(0); T_ACC sum1 = 0; @@ -941,7 +941,7 @@ struct ComputeInternalGradientsVectorizedFunctor using vec_t = memory::aligned_vector; using acc_vec_t = memory::aligned_vector; - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( 
sycl::nd_item<1> item) const { acc_vec_t sum1_vec; acc_vec_t sum2_vec; @@ -1038,7 +1038,7 @@ struct ComputeBackwardFusedParamsFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { using T_ACC = acc_type_device; - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<2> item) const { const int64_t G = group_; const int64_t D = C_ / G; @@ -1176,7 +1176,7 @@ template struct GammaBetaBackwardFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { using T_ACC = acc_type_device; - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<2> item) const { auto group_x = item.get_group(1); auto group_size_x = item.get_local_range(1); diff --git a/src/ATen/native/xpu/sycl/IndexKernelUtils.h b/src/ATen/native/xpu/sycl/IndexKernelUtils.h index 1b1cdc1fdb..4be8b335be 100644 --- a/src/ATen/native/xpu/sycl/IndexKernelUtils.h +++ b/src/ATen/native/xpu/sycl/IndexKernelUtils.h @@ -42,7 +42,7 @@ inline bool fast_gather_kernel_eligible( template struct VectorizedGatherKernel { - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<2> item) const { int64_t ind = idx_[item.get_group(1)]; if (allow_neg_indices_) { diff --git a/src/ATen/native/xpu/sycl/LayerNormKernels.cpp b/src/ATen/native/xpu/sycl/LayerNormKernels.cpp index cedda6a6b7..92a4195d01 100644 --- a/src/ATen/native/xpu/sycl/LayerNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/LayerNormKernels.cpp @@ -186,7 +186,7 @@ struct RowwiseMomentsFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { using WelfordType = WelfordData; using WelfordOp = WelfordOps>; - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<1> item_id) const { const int64_t i = item_id.get_group(0); WelfordOp welford_op = {/*correction=*/0, /*take_sqrt=*/false}; @@ -435,7 +435,7 @@ WelfordDataLN compute_stats( template struct VectorizedLayerNormKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<2> item_id) const { auto i1 = item_id.get_group(1); const T* block_row = X_ + i1 * N_; diff --git a/src/ATen/native/xpu/sycl/LinearInt4.cpp b/src/ATen/native/xpu/sycl/LinearInt4.cpp index 25665b639b..31ed632baf 100644 --- a/src/ATen/native/xpu/sycl/LinearInt4.cpp +++ b/src/ATen/native/xpu/sycl/LinearInt4.cpp @@ -41,7 +41,7 @@ struct LinearInt4KernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { ldc(ldc) {} void sycl_ker_config_convention(sycl::handler& cgh) {} - [[intel::reqd_sub_group_size(16)]] void operator()( + [[sycl::reqd_sub_group_size(16)]] void operator()( sycl::nd_item<1> it) const { int constexpr Unroll = 2; int constexpr SgSize = 16; diff --git a/src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp b/src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp index 8b018de6b8..6b67e081d1 100644 --- a/src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp @@ -79,7 +79,7 @@ struct NllLoss2dForwardNoReduceKernelFunctor { template struct NllLoss2dForwardKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<1> item) const { scalar_t cur_weight; accscalar_t input_sum = 0; diff --git a/src/ATen/native/xpu/sycl/MultiLabelMarginLossKernels.cpp 
b/src/ATen/native/xpu/sycl/MultiLabelMarginLossKernels.cpp index 28047972fd..3c5d3271bc 100644 --- a/src/ATen/native/xpu/sycl/MultiLabelMarginLossKernels.cpp +++ b/src/ATen/native/xpu/sycl/MultiLabelMarginLossKernels.cpp @@ -51,7 +51,7 @@ void multilabel_margin_loss_shape_check( template struct MultilabelMarginLossForwardKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(MULTILABELMARGIN_SUB_GROUP_SIZE)]] void + [[sycl::reqd_sub_group_size(MULTILABELMARGIN_SUB_GROUP_SIZE)]] void operator()(sycl::nd_item<1> item) const { int k = item.get_group(0); const scalar_t* input_k = input_ + k * dim_; @@ -148,7 +148,7 @@ struct MultilabelMarginLossForwardKernelFunctor template struct MultilabelMarginLossBackwardKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(MULTILABELMARGIN_SUB_GROUP_SIZE)]] void + [[sycl::reqd_sub_group_size(MULTILABELMARGIN_SUB_GROUP_SIZE)]] void operator()(sycl::nd_item<1> item) const { int k = item.get_group(0); const scalar_t* input_k = input_ + k * dim_; diff --git a/src/ATen/native/xpu/sycl/Norm.h b/src/ATen/native/xpu/sycl/Norm.h index 6dd893100e..6117b6d261 100644 --- a/src/ATen/native/xpu/sycl/Norm.h +++ b/src/ATen/native/xpu/sycl/Norm.h @@ -600,7 +600,7 @@ template < class Norm, bool one_moment = false> struct FusedNormKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<3> item_id) const { accscalar_t sum1 = 0; accscalar_t sum2 = 0; @@ -747,7 +747,7 @@ template < class Norm, bool one_moment = false> struct RowwiseMomentsKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<3> item_id) const { index_t local_id = item_id.get_local_id(2); diff --git a/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp b/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp index 2a4f749e15..fdedf5fb09 100644 --- a/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp +++ b/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp @@ -216,7 +216,7 @@ template < bool is_same_dtype> struct DispatchSoftmaxForwardKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<1> item) const { if (local_size_ == 1 && item.get_global_id(0) >= outer_size_) return; @@ -933,7 +933,7 @@ template < bool is_same_dtype = false> struct DispatchSoftmaxBackwardKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(SIMD)]] void operator()( + [[sycl::reqd_sub_group_size(SIMD)]] void operator()( sycl::nd_item<1> item) const { if (local_size_ == 1 && item.get_global_id(0) >= outer_size_) return; diff --git a/src/ATen/native/xpu/sycl/SortingKernels.h b/src/ATen/native/xpu/sycl/SortingKernels.h index cce01ba4b5..aad93d9eb6 100644 --- a/src/ATen/native/xpu/sycl/SortingKernels.h +++ b/src/ATen/native/xpu/sycl/SortingKernels.h @@ -15,7 +15,7 @@ namespace xpu { template struct SegmentedGroupRadixSortPairsFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()( + [[sycl::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()( sycl::nd_item<1> item) const { int seg_idx = item.get_group(0); int seg_offset = seg_idx * num_elements_; @@ -96,7 +96,7 @@ void segmented_group_radix_sort_pairs_kernel( template struct 
SegmentedRadixSortPairsUpsweepFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()( + [[sycl::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()( sycl::nd_item<1> item) const { int num_tiles = (num_elements_ + method_t::PROCESSING_LENGTH - 1) / method_t::PROCESSING_LENGTH; @@ -179,7 +179,7 @@ void segmented_radix_sort_pairs_upsweep_kernel( template struct SegmentedRadixSortPairsScanFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()( + [[sycl::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()( sycl::nd_item<1> item) const { constexpr int RADIX_BUCKETS = 16; int seg_idx = item.get_group(0); @@ -218,7 +218,7 @@ void segmented_radix_sort_pairs_scan_kernel( template struct SegmentedRadixSortPairsDownsweepFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()( + [[sycl::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()( sycl::nd_item<1> item) const { int num_tiles = (num_elements_ + method_t::PROCESSING_LENGTH - 1) / method_t::PROCESSING_LENGTH; @@ -448,7 +448,7 @@ struct SegmentedGroupRadixSelectPairsFunctor MAX_KV_BYTES = std::max(sizeof(key_t), sizeof(value_t)), }; - [[intel::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()( + [[sycl::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()( sycl::nd_item<1> item) const { int seg_idx = item.get_group(0); int seg_offset = seg_idx * nelements_; diff --git a/src/ATen/native/xpu/sycl/TensorModeKernel.cpp b/src/ATen/native/xpu/sycl/TensorModeKernel.cpp index 7ae95e36b8..dcecf27ab0 100644 --- a/src/ATen/native/xpu/sycl/TensorModeKernel.cpp +++ b/src/ATen/native/xpu/sycl/TensorModeKernel.cpp @@ -231,7 +231,7 @@ inline T reduceGroupWithNThreadLocalReductions( template struct ComputeModeKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { - [[intel::reqd_sub_group_size(32)]] void operator()( + [[sycl::reqd_sub_group_size(32)]] void operator()( sycl::nd_item<3> item) const { int tidx = item.get_local_id(2); int stidx = item.get_local_range(2) + diff --git a/src/comm/DeviceProperties.h b/src/comm/DeviceProperties.h index af353985ae..ee0d285eaf 100644 --- a/src/comm/DeviceProperties.h +++ b/src/comm/DeviceProperties.h @@ -16,8 +16,13 @@ static int64_t syclMaxWorkGroupSize( auto dev = q.get_device(); auto kid = ::sycl::get_kernel_id(); - auto kbundle = - ::sycl::get_kernel_bundle<::sycl::bundle_state::executable>(ctx, {kid}); + // The kernel won't be built for devices except for the first device. + // Launching kernel on devices except for the first device will raise + // runtime error. Here is an alternative as a temporary solution to + // provide an extra hint to SYCL runtime. 
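  // The three-argument overload get_kernel_bundle(ctx, {dev}, {kid}) builds the
  // executable bundle only for the given device, so the work-group-size query
  // below targets the device the kernel will actually launch on. The
  // two-argument form is meant to cover every device in the context but, per
  // the issue linked below, currently only gets built for the first one.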
+ // https://github.com/intel/llvm/issues/15127 + auto kbundle = ::sycl::get_kernel_bundle<::sycl::bundle_state::executable>( + ctx, {dev}, {kid}); ::sycl::kernel k = kbundle.get_kernel(kid); return k.get_info<::sycl::info::kernel_device_specific::work_group_size>(dev); diff --git a/src/xccl/IpcExchange.hpp b/src/xccl/IpcExchange.hpp new file mode 100644 index 0000000000..e515cd6ce0 --- /dev/null +++ b/src/xccl/IpcExchange.hpp @@ -0,0 +1,400 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "xccl/ze_symbol.hpp" + +#include + +#include +#include +#include +#include +#include + +struct exchange_contents { + // first 4-byte is file descriptor for drmbuf or gem object + union { + ze_ipc_mem_handle_t ipc_handle; + int fd = -1; + }; + size_t offset = 0; + int pid = -1; +}; + +#define sysCheck(x) \ + if (x == -1) { \ + throw std::system_error(std::make_error_code(std::errc(errno))); \ + } + +// We can't inherit it from cmsghdr because flexible array member +struct exchange_fd { + char obscure[CMSG_LEN(sizeof(int)) - sizeof(int)]; + int fd; + + exchange_fd(int cmsg_level, int cmsg_type, int fd) : fd(fd) { + auto* cmsg = reinterpret_cast(obscure); + cmsg->cmsg_len = sizeof(exchange_fd); + cmsg->cmsg_level = cmsg_level; + cmsg->cmsg_type = cmsg_type; + } + + exchange_fd() : fd(-1) { + memset(obscure, 0, sizeof(obscure)); + }; +}; + +void un_send_fd(int sock, int fd, int rank, size_t offset) { + iovec iov[1]; + msghdr msg; + auto rank_offset = std::make_pair(rank, offset); + + iov[0].iov_base = &rank_offset; + iov[0].iov_len = sizeof(rank_offset); + msg.msg_iov = iov; + msg.msg_iovlen = 1; + msg.msg_name = nullptr; + msg.msg_namelen = 0; + + exchange_fd cmsg(SOL_SOCKET, SCM_RIGHTS, fd); + + msg.msg_control = &cmsg; + msg.msg_controllen = sizeof(exchange_fd); + sysCheck(sendmsg(sock, &msg, 0)); +} + +std::tuple un_recv_fd(int sock) { + iovec iov[1]; + msghdr msg; + std::pair rank_offset; + + iov[0].iov_base = &rank_offset; + iov[0].iov_len = sizeof(rank_offset); + msg.msg_iov = iov; + msg.msg_iovlen = 1; + msg.msg_name = nullptr; + msg.msg_namelen = 0; + + exchange_fd cmsg; + msg.msg_control = &cmsg; + msg.msg_controllen = sizeof(exchange_fd); + int n_recv = recvmsg(sock, &msg, 0); + sysCheck(n_recv); + // assert(n_recv == sizeof(int)); + + return std::make_tuple(cmsg.fd, rank_offset.first, rank_offset.second); +} + +int prepare_socket(const char* sockname) { + sockaddr_un un; + memset(&un, 0, sizeof(un)); + un.sun_family = AF_UNIX; + strcpy(un.sun_path, sockname); + + auto sock = socket(AF_UNIX, SOCK_STREAM, 0); + sysCheck(sock); + + int on = 1; + sysCheck(ioctl(sock, FIONBIO, &on)); + + auto size = offsetof(sockaddr_un, sun_path) + strlen(un.sun_path); + sysCheck(bind(sock, (sockaddr*)&un, size)); + + return sock; +} + +int server_listen(const char* sockname) { + unlink(sockname); + auto sock = prepare_socket(sockname); + sysCheck(listen(sock, 10)); + + return sock; +} + +int serv_accept(int listen_sock) { + sockaddr_un un; + + socklen_t len = sizeof(un); + auto accept_sock = accept(listen_sock, (sockaddr*)&un, &len); + sysCheck(accept_sock); + + return accept_sock; +} + +bool wait_for_socket_file(const char* path, int max_seconds = 10) { + struct stat buffer; + for (int i = 0; i < max_seconds * 10; ++i) { + if (stat(path, &buffer) == 0) { + return true; + } + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + return false; +} + +int client_connect(const char* server, const char* client) { + if 
(!wait_for_socket_file(server, 10)) { + std::cerr << "Error: timeout waiting for server socket file: " << server + << std::endl; + exit(EXIT_FAILURE); + } + auto sock = prepare_socket(client); + sockaddr_un sun; + memset(&sun, 0, sizeof(sun)); + sun.sun_family = AF_UNIX; + strcpy(sun.sun_path, server); + auto len = offsetof(sockaddr_un, sun_path) + strlen(server); + const int max_retries = 50; + int retry = 0; + int ret = -1; + while (retry < max_retries) { + ret = connect(sock, (sockaddr*)&sun, len); + if (ret == 0) + break; + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + retry++; + } + if (ret != 0) { + perror("connect failed"); + exit(EXIT_FAILURE); + } + + // sysCheck(connect(sock, (sockaddr*)&sun, len)); + return sock; +} + +void un_allgather( + exchange_contents* send_buf, + exchange_contents recv_buf[], + int rank, + int world) { + const char* servername_prefix = "/tmp/open-peer-ipc-mem-server-rank_"; + const char* clientname_prefix = "/tmp/open-peer-ipc-mem-client-rank_"; + char server_name[64]; + /* get username to make server_name unique */ + auto uid = getuid(); + auto pwd = getpwuid(uid); + snprintf( + server_name, + sizeof(server_name), + "%s%d_%s", + servername_prefix, + rank, + pwd->pw_name); + unlink(server_name); + auto s_listen = server_listen(server_name); + + pollfd fdarray[world]; + int recv_socks[world - 1]; + + for (auto& pollfd : fdarray) + pollfd.fd = -1; + std::fill(recv_socks, recv_socks + world - 1, -1); + + auto fd_guard = [&]() { + for (int i = 0, j = 0; i < world; ++i) { + if (i != rank && recv_socks[j] != -1) + sysCheck(close(recv_socks[j++])); + if (fdarray[i].fd != -1) + sysCheck(close(fdarray[i].fd)); + } + }; + + struct guard__ { + using F = decltype(fd_guard); + F f; + guard__(const F& f) : f(f) {} + ~guard__() { + f(); + } + } free_fd(fd_guard); + + // connect to all ranks + for (int i = 0; i < world; ++i) { + if (rank == i) { + fdarray[i].fd = s_listen; + fdarray[i].events = POLLIN; + fdarray[i].revents = 0; + } else { + char peer_name[64]; + char client_name[64]; + + snprintf( + client_name, + sizeof(client_name), + "%s%d-%d_%s", + clientname_prefix, + rank, + i, + pwd->pw_name); + unlink(client_name); + + snprintf( + peer_name, + sizeof(peer_name), + "%s%d_%s", + servername_prefix, + i, + pwd->pw_name); + fdarray[i].fd = client_connect(peer_name, client_name); + fdarray[i].events = POLLOUT; + fdarray[i].revents = 0; + } + } + + // std::future> future_fds[world -1]; + int slot = 0; + uint32_t send_progress = 1 << rank; + + while (slot < world - 1 || send_progress != (1 << world) - 1) { + sysCheck(ppoll(fdarray, world, nullptr, nullptr)); + + for (int i = 0; i < world; ++i) { + if (i == rank && (fdarray[i].revents & POLLIN)) { + // auto accept_sock = serv_accept(fdarray[i].fd); + // future_fds[slot ++] = std::async( + // std::launch::async, [=]() { + // struct sock_guard{ + // int sock; + // sock_guard(int sock) : sock(sock) {} + // ~guard_sock() {sysCheck(close(sock));} + // } release(accept_sock); + // auto ret = un_recv_fd(accept_sock); + // return ret;}); + recv_socks[slot++] = serv_accept(fdarray[i].fd); + } else if ( + (send_progress & (1 << i)) == 0 && fdarray[i].revents & POLLOUT) { + un_send_fd(fdarray[i].fd, send_buf->fd, rank, send_buf->offset); + send_progress |= 1 << i; + } + } + } + + for (int i = 0; i < world - 1; ++i) { + // future_fds[i].wait(); + // auto [fd, peer, offset] = future_fds[i].get(); + auto [fd, peer, offset] = un_recv_fd(recv_socks[i]); + recv_buf[peer].fd = fd; + recv_buf[peer].offset = offset; + } + + 
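  // At this point the poll loop above has both sent this rank's {fd, offset}
  // pair to every peer and accepted every peer's connection, and un_recv_fd()
  // has pulled each peer's descriptor out of the SCM_RIGHTS ancillary data, so
  // the kernel has already installed a local duplicate of every remote handle.
  // The final step fills in our own slot so recv_buf[] is a complete allgather
  // result.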
recv_buf[rank] = *send_buf; +} + +class IpcChannel { + public: + IpcChannel() { + initialized = false; + } + void init(sycl::queue& queue, uint32_t rank_in, uint32_t world_in) { + if (initialized) + return; + + if (!load_level_zero_library()) { + throw std::runtime_error("Failed to initialize Level Zero"); + } + + zeCheck_dynamic(zeInit_dynamic(0)); + int tmp_rank, tmp_world; + + tmp_world = world_in; + tmp_rank = rank_in; + + rank = tmp_rank; + world = tmp_world; + initialized = true; + } + void release(sycl::queue& queue) { + if (!initialized) + return; + try { + auto l0_ctx = sycl::get_native( + queue.get_context()); + for (int i = 0; i < world; i++) { + if (i != rank) { + zeCheck_dynamic(zeMemCloseIpcHandle_dynamic( + l0_ctx, (char*)buffers[i] - offsets[i])); + } + } + } catch (const std::exception& e) { + std::cerr << "Warning: Level Zero cleanup failed: " << e.what() + << std::endl; + } + sycl::free(buffers[rank], queue); + initialized = false; + } + + // buffer_size as element size + void exchange_peer_ipc_mem( + sycl::queue& queue, + void* ptr, + uint32_t rank_in, + uint32_t world_in) { + if (!initialized) + init(queue, rank_in, world_in); + if (!load_level_zero_library()) { + throw std::runtime_error("Level Zero not available"); + } + + // Step 1: Get base address of the pointer + sycl::context ctx = queue.get_context(); + auto l0_ctx = sycl::get_native(ctx); + + void* base_addr; + size_t base_size; + zeCheck_dynamic( + zeMemGetAddressRange_dynamic(l0_ctx, ptr, &base_addr, &base_size)); + + // Step 2: Get IPC mem handle from base address + alignas(64) exchange_contents send_buf; + alignas(64) exchange_contents recv_buf[world]; + + // fill in the exchange info + zeCheck_dynamic( + zeMemGetIpcHandle_dynamic(l0_ctx, base_addr, &send_buf.ipc_handle)); + send_buf.offset = (char*)ptr - (char*)base_addr; + + send_buf.pid = getpid(); + + // Step 3: Exchange the handles and offsets + memset(recv_buf, 0, sizeof(recv_buf)); + // Overkill if we don't really needs all peer's handles + un_allgather(&send_buf, recv_buf, rank, world); + for (uint32_t i = 0; i < world; i++) { + // Step 4: Prepare pid file descriptor of next process + auto* peer = recv_buf + i; + // Step 6: Open IPC handle of remote peer + auto l0_device = sycl::get_native( + queue.get_device()); + void* peer_base; + + zeCheck_dynamic(zeMemOpenIpcHandle_dynamic( + l0_ctx, + l0_device, + peer->ipc_handle, + ZE_IPC_MEMORY_FLAG_BIAS_CACHED, + &peer_base)); + + buffers[i] = (char*)peer_base + peer->offset; + offsets[i] = peer->offset; + ipc_handle[i] = send_buf.ipc_handle; + } + } + + bool initialized; + static constexpr uint32_t max_rank = 16; + void* buffers[max_rank]; + void* sync_buffer[max_rank]; + size_t offsets[max_rank]; + ze_ipc_mem_handle_t ipc_handle[max_rank]; + int rank, world; + int size_per_buffer; + int data_size_per_buffer; + int buffer_index; +}; diff --git a/src/xccl/ProcessGroupXCCL.cpp b/src/xccl/ProcessGroupXCCL.cpp index 9f72cd1bd3..8a8be26659 100644 --- a/src/xccl/ProcessGroupXCCL.cpp +++ b/src/xccl/ProcessGroupXCCL.cpp @@ -283,6 +283,11 @@ bool ProcessGroupXCCL::WorkXCCL::isCompleted() { void ProcessGroupXCCL::WorkXCCL::synchronize() { synchronizeStream(); + if (c10d::allow_inflight_collective_as_graph_input()) { + c10d::unregister_work( + c10::intrusive_ptr< + ProcessGroupXCCL::WorkXCCL>::unsafe_reclaim_from_nonowning(this)); + } } void ProcessGroupXCCL::WorkXCCL::synchronizeStream() { @@ -427,17 +432,6 @@ void ProcessGroupXCCL::setEnqueuedPgStatus( pgStatus_->lastEnqueuedNumelOut = work->numelOut_; } -void 
ProcessGroupXCCL::setCompletedPgStatus( - c10::intrusive_ptr work) { - pgStatus_->lastCompletedSeq = static_cast(work->getSequencenumber()); - pgStatus_->lastCompletedWorkName = opTypeToString(work->opType_); - pgStatus_->lastCompletedNumelIn = work->numelIn_; - pgStatus_->lastCompletedNumelOut = work->numelOut_; - // To avoid complexity, we're not computing duration. - FlightRecorderXCCL::get()->retire_id( - work->trace_id_, /*compute_duration*/ false); -} - void ProcessGroupXCCL::setSequenceNumberForGroup() {} uint64_t ProcessGroupXCCL::getSequenceNumberForGroup() { @@ -767,8 +761,12 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( work->future_ = c10::make_intrusive( c10::ListType::create(c10::TensorType::get()), devices); work->future_->markCompleted(at::IValue(*work->outputs_)); + auto id = work->trace_id_; work->future_->addCallback( - [this, work](at::ivalue::Future&) { this->setCompletedPgStatus(work); }); + [id](at::ivalue::Future&) { + FlightRecorderXCCL::get()->retire_id(id, /*compute_duration*/ false); + }, + /*use_future*/ false); work->blockingWait_ = blockingWait_; work->numelIn_ = 0; @@ -879,9 +877,12 @@ c10::intrusive_ptr ProcessGroupXCCL::pointToPoint( work->future_ = c10::make_intrusive( c10::ListType::create(c10::TensorType::get()), devices); work->future_->markCompleted(at::IValue(*work->outputs_)); - work->future_->addCallback([this, work](at::ivalue::Future&) { - this->setCompletedPgStatus(work); - }); + auto id = work->trace_id_; + work->future_->addCallback( + [id](at::ivalue::Future&) { + FlightRecorderXCCL::get()->retire_id(id, /*compute_duration*/ false); + }, + /*use_future*/ false); work->numelIn_ = work->numelOut_ = tensor.numel(); setEnqueuedPgStatus(work); diff --git a/src/xccl/ProcessGroupXCCL.hpp b/src/xccl/ProcessGroupXCCL.hpp index e7aa39c82d..1d3241e4d4 100644 --- a/src/xccl/ProcessGroupXCCL.hpp +++ b/src/xccl/ProcessGroupXCCL.hpp @@ -415,8 +415,6 @@ class TORCH_API ProcessGroupXCCL : public Backend { const std::vector& groupRanks() const; void setEnqueuedPgStatus(c10::intrusive_ptr work); - void setCompletedPgStatus( - c10::intrusive_ptr work); bool dumpDebuggingInfo(bool includeStackTrace = true); protected: diff --git a/src/xccl/XPUSymmetricMemory.cpp b/src/xccl/XPUSymmetricMemory.cpp new file mode 100644 index 0000000000..d49d126122 --- /dev/null +++ b/src/xccl/XPUSymmetricMemory.cpp @@ -0,0 +1,460 @@ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace c10d { +namespace symmetric_memory { + +static StoreExchange storeExchange = StoreExchange("XPUSymmetricMemory"); + +AllocationRef::AllocationRef( + void* ptr, + HandleType handle, + size_t block_size, + int device_idx, + bool local_allocation) + : ptr(ptr), + handle(handle), + block_size(block_size), + device_idx(device_idx), + local_allocation(local_allocation){} + +AllocationRef::~AllocationRef() { + if (is_finalizing()) { + return; + } + // Currently, we cannot free virtual memory exchanged from other device. 
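  // Buffers imported from peers are IPC views opened with zeMemOpenIpcHandle
  // (see IpcChannel::exchange_peer_ipc_mem); they are released by closing the
  // IPC handle, as IpcChannel::release does, not by sycl::free. Only memory
  // this rank allocated itself is freed below.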
+ if (!local_allocation) { + return; + } + c10::Device local_device(c10::DeviceType::XPU, device_idx); + c10::DeviceGuard guard(local_device); + c10::xpu::syncStreamsOnDevice(); + auto stream = at::xpu::getCurrentXPUStream(); + sycl::free(ptr, stream); +} + +XPUSymmetricMemory::XPUSymmetricMemory( + std::vector> alloc_refs, + std::vector buffers, + std::vector signal_pads, + HandleType mc_handle, + void* mc_addr, + size_t buffer_size, + int local_device_idx, + int rank, + int world_size) + : alloc_refs_(std::move(alloc_refs)), + buffers_(std::move(buffers)), + signal_pads_(std::move(signal_pads)), + mc_handle_(mc_handle), + mc_addr_(mc_addr), + buffer_size_(buffer_size), + local_device_idx_(local_device_idx), + rank_(rank), + world_size_(world_size) { + const size_t arr_size = sizeof(void*) * world_size_; + buffers_dev_ = reinterpret_cast( + c10::xpu::XPUCachingAllocator::raw_alloc(arr_size)); + signal_pads_dev_ = reinterpret_cast( + c10::xpu::XPUCachingAllocator::raw_alloc(arr_size)); + + c10::Device local_device(c10::DeviceType::XPU, local_device_idx); + c10::DeviceGuard guard(local_device); + + at::xpu::getCurrentXPUStream().queue().memcpy( + buffers_dev_, buffers_.data(), arr_size); + at::xpu::getCurrentXPUStream().queue().memcpy( + signal_pads_dev_, signal_pads_.data(), arr_size); +} + +std::vector XPUSymmetricMemory::get_buffer_ptrs() { + return buffers_; +} + +std::vector XPUSymmetricMemory::get_signal_pad_ptrs() { + return signal_pads_; +} + +void** XPUSymmetricMemory::get_buffer_ptrs_dev() { + return buffers_dev_; +} + +void** XPUSymmetricMemory::get_signal_pad_ptrs_dev() { + return signal_pads_dev_; +} + +size_t XPUSymmetricMemory::get_buffer_size() { + return buffer_size_; +} + +size_t XPUSymmetricMemory::get_signal_pad_size() { + return signal_pad_size; +} + +bool XPUSymmetricMemory::has_multicast_support() { + return false; +} + +void* XPUSymmetricMemory::get_multicast_ptr() { + return nullptr; +} + +at::Tensor XPUSymmetricMemory::get_buffer( + int rank, + c10::IntArrayRef sizes, + c10::ScalarType dtype, + int64_t storage_offset) { + const size_t numel = std::accumulate( + sizes.begin(), + sizes.end(), + static_cast(1), + std::multiplies()); + const auto element_size = c10::elementSize(dtype); + const auto req_size = (numel + storage_offset) * element_size; + TORCH_CHECK( + req_size <= buffer_size_, + "XPUSymmetricMemory::get_buffer: the requested size (", + req_size, + " bytes) exceeds the allocated size (", + buffer_size_, + " bytes)"); + auto data_ptr = reinterpret_cast(buffers_[rank]) + + storage_offset * element_size; + // check the device of this device buffer + auto ptr_to_device_id = c10::xpu::get_device_idx_from_pointer(data_ptr); + auto device = c10::Device(c10::DeviceType::XPU, ptr_to_device_id); + auto options = at::TensorOptions().dtype(dtype).device(device); + + return at::for_blob(data_ptr, sizes) + .options(options) + .target_device(device) + .make_tensor(); +} + +void check_channel(int channel, int world_size) { + TORCH_CHECK( + channel >= 0, + "channel for barrier(), put_signal() and wait_signal() ", + "must be greater than 0 (got ", + channel, + ")"); + const size_t num_channels = signal_pad_size / sizeof(uint32_t) * world_size; + TORCH_CHECK( + static_cast(channel) < num_channels, + "The maximum supported channel for barrier(), put_signal() and wait_signal() is ", + num_channels - 1, + " (got ", + channel, + ")"); +} + +void XPUSymmetricMemory::barrier(int channel, size_t timeout_ms) { + check_channel(channel, world_size_); + + // Currently, we leverage 
oneCCL for barrier. Later, we may move to SYCL + // implementation. + auto group = c10d::resolve_process_group(group_name_); + if (group == nullptr) { + TORCH_WARN( + "Process group '", + group_name_, + "' not found, please init process group first before calling SymmetricMemory"); + throw std::runtime_error("Process group not found"); + } + auto* xcclPg = dynamic_cast( + group->getBackend(c10::DeviceType::XPU).get()); + + c10::Device local_device(c10::DeviceType::XPU, local_device_idx_); + c10::DeviceGuard guard(local_device); + + static thread_local at::Tensor barrier_tensor; + if (!barrier_tensor.defined() || barrier_tensor.device() != local_device) { + barrier_tensor = at::zeros( + {1}, at::TensorOptions().device(local_device).dtype(at::kFloat)); + } else { + barrier_tensor.zero_(); + } + + c10d::AllreduceOptions arOpts; + arOpts.asyncOp = false; + auto work = + xcclPg->allreduce_impl(barrier_tensor, "xccl:symm_mem_barrier", arOpts); + + if (work) { + bool success = work->wait(std::chrono::milliseconds(timeout_ms)); + TORCH_CHECK( + success, + "Barrier timeout after ", + timeout_ms, + " ms for group '", + group_name_, + "'"); + } +} + +void XPUSymmetricMemory::put_signal( + int dst_rank, + int channel, + size_t timeout_ms) { + LOG(ERROR) << "XPUSymmetricMemory::put_signal not supported"; +} + +void XPUSymmetricMemory::wait_signal( + int src_rank, + int channel, + size_t timeout_ms) { + LOG(ERROR) << "XPUSymmetricMemory::wait_signal not supported"; +} + +int XPUSymmetricMemory::get_rank() { + return rank_; +} + +int XPUSymmetricMemory::get_world_size() { + return world_size_; +} + +c10::Device XPUSymmetricMemory::get_device() { + return c10::Device(c10::DeviceType::XPU, local_device_idx_); +} + +Block::Block( + c10::intrusive_ptr alloc_ref, + int device_idx, + size_t block_size, + size_t buffer_size, + size_t signal_pad_offset, + const std::optional& group_name) + : alloc_ref(std::move(alloc_ref)), + device_idx(device_idx), + block_size(block_size), + buffer_size(buffer_size), + signal_pad_offset(signal_pad_offset), + default_group_name(std::move(group_name)) {} + +void* XPUSymmetricMemoryAllocator::alloc( + size_t size, + int device_idx, + const std::optional& group_name) { + size_t signal_pad_offset = at::round_up(size, 16UL); + size_t block_size = signal_pad_offset + signal_pad_size; + + sycl::queue current_queue = at::xpu::getCurrentXPUStream().queue(); + void* ptr = sycl::malloc_device(block_size, current_queue); + current_queue.memset(ptr, 0, block_size); + auto alloc_ref = + c10::make_intrusive(ptr, ptr, block_size, device_idx, true); + auto block = c10::make_intrusive( + std::move(alloc_ref), + device_idx, + block_size, + size, + signal_pad_offset, + group_name); + + { + std::unique_lock lock(mutex_); + ptr_to_block_.emplace(ptr, std::move(block)); + } + return ptr; +} + +void XPUSymmetricMemoryAllocator::free(void* ptr) { + std::unique_lock lock(mutex_); + ptr_to_block_.erase(ptr); +} + +size_t XPUSymmetricMemoryAllocator::get_alloc_size(void* ptr) { + auto block = find_block(ptr); + TORCH_CHECK( + block != nullptr, + "XPUSymmetricMemoryAllocator::get_alloc_size: input must be allocated ", + "via XPUSymmetricMemoryAllocator::alloc"); + return block->buffer_size; +} + +struct RendezvousRequest { + int device_idx; + int pid; + size_t block_size; + size_t buffer_size; + size_t signal_pad_offset; + bool has_multicast_support; +}; + +void validate_rendezvous_requests( + const std::vector& reqs, + int world_size) { + TORCH_CHECK(reqs.size() == (size_t)world_size); + + 
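  // All ranks must describe the same symmetric block layout: block size, usable
  // buffer size and signal-pad offset are compared against rank 0 below. The
  // device-index set that follows is gathered but not otherwise checked here.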
std::unordered_set device_indices; + device_indices.reserve(world_size); + for (auto req : reqs) { + device_indices.insert(req.device_idx); + } + + for (int r = 1; r < world_size; ++r) { + TORCH_CHECK(reqs[r].block_size == reqs[0].block_size); + TORCH_CHECK(reqs[r].buffer_size == reqs[0].buffer_size); + TORCH_CHECK(reqs[r].signal_pad_offset == reqs[0].signal_pad_offset); + } +} + +c10::intrusive_ptr XPUSymmetricMemoryAllocator::rendezvous( + void* ptr, + const std::optional& group_name) { + auto block = find_block(ptr); + if (block == nullptr) { + return nullptr; + } + + // The group_name passed to rendezvous() takes precedence over + // the default group_name specified during allocation. + std::string group_name_; + // Treat empty string and std::nullopt the same as empty string seems to be + // implicitly used that way + if (group_name.has_value() && group_name != "") { + group_name_ = *group_name; + } else { + if (!block->default_group_name.has_value()) { + TORCH_CHECK( + false, + "XPUSymmetricMemory::rendezvous: `group_name` is neither " + "specified during allocation nor passed to rendezvous()."); + } + group_name_ = *block->default_group_name; + } + + auto it = block->symm_mems.find(group_name_); + if (it != block->symm_mems.end()) { + return it->second; + } + + c10::Device local_device(c10::DeviceType::XPU, block->device_idx); + c10::DeviceGuard guard(local_device); + + // IpcChannel is used to do inter-process communication + IpcChannel ipc_channel; + auto group_info = get_group_info(group_name_); + auto store = group_info.store; + int rank = group_info.rank; + int world_size = group_info.world_size; + sycl::queue current_queue = at::xpu::getCurrentXPUStream().queue(); + + auto local_req = RendezvousRequest{ + .device_idx = block->device_idx, + .pid = getpid(), + .block_size = block->block_size, + .buffer_size = block->buffer_size, + .signal_pad_offset = block->signal_pad_offset, + .has_multicast_support = false}; + auto reqs = storeExchange.all_gather(store, rank, world_size, local_req); + validate_rendezvous_requests(reqs, world_size); + + std::vector pids(world_size); + for (int r = 0; r < world_size; ++r) { + pids[r] = reqs[r].pid; + } + + // do IPC exchange for all peer ranks + ipc_channel.exchange_peer_ipc_mem(current_queue, ptr, rank, world_size); + + // no physical memory handle, so handles and buffers are both for virtual + // address + std::vector handles(world_size); + std::vector buffers(world_size, nullptr); + std::vector signal_pads(world_size, nullptr); + + for (int r = 0; r < world_size; ++r) { + if (r == rank) { + handles[r] = block->alloc_ref->handle; + buffers[r] = ptr; + signal_pads[r] = (void*)((uintptr_t)ptr + block->signal_pad_offset); + continue; + } else { + buffers[r] = ipc_channel.buffers[r]; + handles[r] = ipc_channel.buffers[r]; + signal_pads[r] = + (void*)((uintptr_t)buffers[r] + block->signal_pad_offset); + } + } + storeExchange.barrier(store, rank, world_size); + + HandleType mc_handle{}; + void* mc_addr = nullptr; + + std::vector> alloc_refs; + for (int r = 0; r < world_size; ++r) { + if (r == rank) { + alloc_refs.emplace_back(block->alloc_ref); + continue; + } + alloc_refs.push_back(c10::make_intrusive( + buffers[r], handles[r], block->block_size, block->device_idx, false)); + } + + auto symm_mem = c10::make_intrusive( + std::move(alloc_refs), + std::move(buffers), + std::move(signal_pads), + mc_handle, + mc_addr, + block->buffer_size, + block->device_idx, + group_info.rank, + group_info.world_size); + symm_mem->set_group_name(group_name_); + 
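  // Rough usage sketch (hypothetical caller; the get_allocator() lookup is an
  // assumption, everything else is defined in this file):
  //
  //   auto allocator = get_allocator(c10::DeviceType::XPU);  // assumed lookup helper
  //   void* ptr = allocator->alloc(nbytes, device_idx, group_name);
  //   auto symm = allocator->rendezvous(ptr, group_name);
  //   at::Tensor peer = symm->get_buffer(peer_rank, sizes, dtype, /*storage_offset=*/0);
  //   symm->barrier(/*channel=*/0, /*timeout_ms=*/10000);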
+  block->symm_mems[group_name_] = symm_mem;
+  return symm_mem;
+}
+
+bool XPUSymmetricMemoryAllocator::has_multicast_support(int device_idx) {
+  return false;
+}
+
+c10::DeviceType XPUSymmetricMemoryAllocator::supported_device_type() {
+  return c10::DeviceType::XPU;
+}
+
+std::string XPUSymmetricMemoryAllocator::name() {
+  return "XPU";
+}
+
+c10::intrusive_ptr<Block> XPUSymmetricMemoryAllocator::find_block(void* ptr) {
+  std::shared_lock lock(mutex_);
+  auto it = ptr_to_block_.find(ptr);
+  if (it == ptr_to_block_.end()) {
+    return nullptr;
+  }
+  return it->second;
+}
+
+struct RegisterXPUSymmetricMemoryAllocator {
+  RegisterXPUSymmetricMemoryAllocator() {
+    auto allocator = c10::make_intrusive<XPUSymmetricMemoryAllocator>();
+    // Query backend used for XPU
+    if (getSymmMemBackendXPU() == "XPU") {
+      // Direct set (static registration)
+      register_allocator(c10::DeviceType::XPU, allocator);
+    } else {
+      // Register availability in case `set_backend` is called dynamically
+      register_availability("XPU", allocator);
+    }
+  }
+};
+static RegisterXPUSymmetricMemoryAllocator register_allocator_;
+
+} // namespace symmetric_memory
+} // namespace c10d
diff --git a/src/xccl/XPUSymmetricMemory.hpp b/src/xccl/XPUSymmetricMemory.hpp
new file mode 100644
index 0000000000..2daac1114a
--- /dev/null
+++ b/src/xccl/XPUSymmetricMemory.hpp
@@ -0,0 +1,130 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+
+namespace c10d::symmetric_memory {
+
+// Resource wrapper that owns a (vaddr, allocation handle) pair. Upon
+// destruction, it unmaps the vaddr and releases the allocation handle.
+struct AllocationRef : public c10::intrusive_ptr_target {
+  void* ptr;
+  HandleType handle;
+  size_t block_size;
+  int device_idx;
+  bool local_allocation;
+
+  AllocationRef(
+      void* ptr,
+      HandleType handle,
+      size_t block_size,
+      int device_idx,
+      bool local_allocation);
+
+  ~AllocationRef();
+};
+
+class XPUSymmetricMemory : public SymmetricMemory {
+ public:
+  XPUSymmetricMemory(
+      std::vector<c10::intrusive_ptr<AllocationRef>> alloc_refs,
+      std::vector<void*> buffers,
+      std::vector<void*> signal_pads,
+      HandleType mc_handle,
+      void* mc_addr,
+      size_t buffer_size,
+      int local_device_idx,
+      int rank,
+      int world_size);
+
+  ~XPUSymmetricMemory() override {}
+
+  std::vector<void*> get_buffer_ptrs() override;
+  std::vector<void*> get_signal_pad_ptrs() override;
+  void** get_buffer_ptrs_dev() override;
+  void** get_signal_pad_ptrs_dev() override;
+  size_t get_buffer_size() override;
+  size_t get_signal_pad_size() override;
+
+  bool has_multicast_support() override;
+  void* get_multicast_ptr() override;
+
+  at::Tensor get_buffer(
+      int rank,
+      c10::IntArrayRef sizes,
+      c10::ScalarType dtype,
+      int64_t storage_offset);
+
+  void barrier(int channel, size_t timeout_ms) override;
+  void put_signal(int dst_rank, int channel, size_t timeout_ms) override;
+  void wait_signal(int src_rank, int channel, size_t timeout_ms) override;
+
+  int get_rank() override;
+  int get_world_size() override;
+  c10::Device get_device() override;
+
+  void set_group_name(const std::string& group_name) {
+    group_name_ = group_name;
+  }
+
+ private:
+  std::vector<c10::intrusive_ptr<AllocationRef>> alloc_refs_;
+  std::vector<void*> buffers_;
+  std::vector<void*> signal_pads_;
+  HandleType mc_handle_;
+  void* mc_addr_;
+  size_t buffer_size_;
+  int local_device_idx_;
+  int rank_;
+  int world_size_;
+  void** buffers_dev_;
+  void** signal_pads_dev_;
+  std::string group_name_;
+};
+
+struct Block : public c10::intrusive_ptr_target {
+  c10::intrusive_ptr<AllocationRef> alloc_ref;
+  int device_idx;
+  size_t block_size;
+  size_t buffer_size;
+  size_t signal_pad_offset;
+  std::optional<std::string> default_group_name;
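+  // One rendezvoused SymmetricMemory instance per group name for this block.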
+  std::map<std::string, c10::intrusive_ptr<SymmetricMemory>> symm_mems;
+
+  Block(
+      c10::intrusive_ptr<AllocationRef> alloc_ref,
+      int device_idx,
+      size_t block_size,
+      size_t buffer_size,
+      size_t signal_pad_offset,
+      const std::optional<std::string>& group_name);
+};
+
+class XPUSymmetricMemoryAllocator : public SymmetricMemoryAllocator {
+ public:
+  void* alloc(
+      size_t size,
+      int device_idx,
+      const std::optional<std::string>& group_name) override;
+
+  void free(void* ptr) override;
+  size_t get_alloc_size(void* ptr) override;
+  c10::intrusive_ptr<SymmetricMemory> rendezvous(
+      void* ptr,
+      const std::optional<std::string>& group_name) override;
+  bool has_multicast_support(int device_idx) override;
+  // void exchange_peer_ipc_mem(sycl::queue& queue, void* ptr);
+  c10::DeviceType supported_device_type() override;
+  std::string name() override;
+
+ private:
+  c10::intrusive_ptr<Block> find_block(void* ptr);
+
+  std::shared_mutex mutex_;
+  std::unordered_map<void*, c10::intrusive_ptr<Block>> ptr_to_block_;
+};
+
+} // namespace c10d::symmetric_memory
diff --git a/src/xccl/XPUSymmetricMemoryTypes.hpp b/src/xccl/XPUSymmetricMemoryTypes.hpp
new file mode 100644
index 0000000000..4cab3b81f7
--- /dev/null
+++ b/src/xccl/XPUSymmetricMemoryTypes.hpp
@@ -0,0 +1,8 @@
+#pragma once
+
+namespace c10d::symmetric_memory {
+
+constexpr size_t signal_pad_size = 2048;
+using HandleType = void*;
+
+} // namespace c10d::symmetric_memory
diff --git a/src/xccl/XPUSymmetricMemoryUtils.cpp b/src/xccl/XPUSymmetricMemoryUtils.cpp
new file mode 100644
index 0000000000..7130fe7b6a
--- /dev/null
+++ b/src/xccl/XPUSymmetricMemoryUtils.cpp
@@ -0,0 +1,76 @@
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+namespace c10d::symmetric_memory {
+
+std::string getSymmMemBackendXPU() {
+  static auto val = c10::utils::get_env("TORCH_SYMMMEM");
+  if (val.has_value()) {
+    TORCH_CHECK(
+        val.value() == "XPU",
+        "TORCH_SYMMMEM environment variable must be 'XPU'.");
+    return val.value();
+  }
+  return "XPU";
+}
+
+bool device_has_multicast_support(int device_idx) {
+  return false;
+}
+
+bool allow_overlapping_devices() {
+  return false;
+}
+
+void map_block(
+    void** ptr,
+    ze_physical_mem_handle_t handle,
+    size_t size,
+    int device_idx) {
+  sycl::queue current_queue = at::xpu::getCurrentXPUStream().queue();
+  sycl::context sycl_ctx = current_queue.get_context();
+  ze_context_handle_t ze_context =
+      sycl::get_native<sycl::backend::ext_oneapi_level_zero>(sycl_ctx);
+  // 1. Reserve virtual address space
+  void* virtual_ptr = nullptr;
+  ze_result_t status = zeVirtualMemReserve(
+      ze_context, // context
+      nullptr, // let L0 pick virtual address
+      size, // size
+      &virtual_ptr // out: reserved address
+  );
+  TORCH_CHECK(status == ZE_RESULT_SUCCESS, "zeVirtualMemReserve failed");
+
+  // 2. Map physical memory to virtual address
+  status = zeVirtualMemMap(
+      ze_context,
+      virtual_ptr, // virtual memory to map to
+      size,
+      handle, // physical memory handle
+      0, // offset into the physical allocation
+      ZE_MEMORY_ACCESS_ATTRIBUTE_READWRITE // ze_memory_access_attribute_t
+  );
+  TORCH_CHECK(status == ZE_RESULT_SUCCESS, "zeVirtualMemMap failed");
+
+  // 3. Set access attributes
+  ze_memory_access_attribute_t access = ZE_MEMORY_ACCESS_ATTRIBUTE_READWRITE;
+  status =
+      zeVirtualMemSetAccessAttribute(ze_context, virtual_ptr, size, access);
+  TORCH_CHECK(
+      status == ZE_RESULT_SUCCESS, "zeVirtualMemSetAccessAttribute failed");
+
+  // 4. Return the mapped pointer
+  *ptr = virtual_ptr;
+}
+
+} // namespace c10d::symmetric_memory
diff --git a/src/xccl/XPUSymmetricMemoryUtils.hpp b/src/xccl/XPUSymmetricMemoryUtils.hpp
new file mode 100644
index 0000000000..69189f45cf
--- /dev/null
+++ b/src/xccl/XPUSymmetricMemoryUtils.hpp
@@ -0,0 +1,89 @@
+#pragma once
+#include
+#include
+#include
+
+namespace c10d {
+namespace symmetric_memory {
+
+std::string getSymmMemBackendXPU();
+
+bool device_has_multicast_support(int device_idx);
+
+bool allow_overlapping_devices();
+
+// A set of store-based exchange methods with a preset prefix, typically the
+// type of the SymmetricMemory. Mostly used as static instances in the
+// respective SymmetricMemory implementation files.
+class StoreExchange {
+ public:
+  StoreExchange(const std::string& store_prefix)
+      : store_prefix_(store_prefix) {}
+
+  // Defined in the header so the template is visible to every translation
+  // unit that uses it.
+  template <typename T>
+  std::vector<T> all_gather(
+      const c10::intrusive_ptr<c10d::Store>& store,
+      int rank,
+      int world_size,
+      T val) {
+    static_assert(std::is_trivially_copyable_v<T>);
+
+    std::vector<std::string> peer_keys;
+    peer_keys.reserve(world_size);
+    for (int r = 0; r < world_size; ++r) {
+      std::ostringstream oss;
+      oss << store_prefix_ << "/" << seq_id_ << "/" << r;
+      peer_keys.push_back(oss.str());
+    }
+    ++seq_id_;
+
+    {
+      std::vector<uint8_t> payload(
+          reinterpret_cast<uint8_t*>(&val),
+          reinterpret_cast<uint8_t*>(&val) + sizeof(T));
+      store->set(peer_keys[rank], payload);
+    }
+
+    std::vector<T> peer_vals;
+    peer_vals.reserve(world_size);
+    for (int r = 0; r < world_size; ++r) {
+      if (r == rank) {
+        peer_vals.push_back(val);
+        continue;
+      }
+      store->wait({peer_keys[r]});
+      auto payload = store->get(peer_keys[r]);
+      TORCH_CHECK(payload.size() == sizeof(T));
+      T peer_val{};
+      std::memcpy(&peer_val, payload.data(), sizeof(T));
+      peer_vals.push_back(peer_val);
+    }
+    return peer_vals;
+  }
+
+  void barrier(
+      const c10::intrusive_ptr<c10d::Store>& store,
+      int rank,
+      int world_size) {
+    // TODO: implement an efficient one?
+    all_gather(store, rank, world_size, 0);
+  }
+
+ private:
+  const std::string store_prefix_;
+  size_t seq_id_ = 0;
+};
+
+// Returns a pointer to a virtual address that is mapped to the physical
+// memory held by the handle.
+// TODO: switch to exchanging physical memory handles mapped into virtual
+// addresses once L0 provides a physical-handle exchange API and multicast
+// support is available.
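+// The current implementation reserves a virtual address range with
+// zeVirtualMemReserve, maps it with zeVirtualMemMap, and sets access rights
+// with zeVirtualMemSetAccessAttribute (see XPUSymmetricMemoryUtils.cpp).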
+void map_block( + void** ptr, + ze_physical_mem_handle_t handle, + size_t size, + int device_idx); + +} // namespace symmetric_memory +} // namespace c10d diff --git a/src/xccl/ze_symbol.hpp b/src/xccl/ze_symbol.hpp new file mode 100644 index 0000000000..20af666811 --- /dev/null +++ b/src/xccl/ze_symbol.hpp @@ -0,0 +1,254 @@ +#pragma once + +#include +#include +#include +#include + +#define zeVirtualMemMap zeVirtualMemMap_original +#define zeVirtualMemReserve zeVirtualMemReserve_original +#define zeVirtualMemSetAccessAttribute zeVirtualMemSetAccessAttribute_original + +#include + +#undef zeVirtualMemMap +#undef zeVirtualMemReserve +#undef zeVirtualMemSetAccessAttribute + +typedef ze_result_t (*zeInit_t)(ze_init_flags_t flags); +typedef ze_result_t (*zeMemGetAddressRange_t)( + ze_context_handle_t hContext, + const void* ptr, + void** pBase, + size_t* pSize); +typedef ze_result_t (*zeMemGetIpcHandle_t)( + ze_context_handle_t hContext, + const void* ptr, + ze_ipc_mem_handle_t* pIpcHandle); +typedef ze_result_t (*zeMemOpenIpcHandle_t)( + ze_context_handle_t hContext, + ze_device_handle_t hDevice, + ze_ipc_mem_handle_t handle, + ze_ipc_memory_flags_t flags, + void** pptr); +typedef ze_result_t ( + *zeMemCloseIpcHandle_t)(ze_context_handle_t hContext, const void* ptr); +typedef ze_result_t (*zeVirtualMemMap_t)( + ze_context_handle_t hContext, + const void* ptr, + size_t size, + ze_physical_mem_handle_t hPhysicalMemory, + size_t offset, + ze_memory_access_attribute_t access); +typedef ze_result_t (*zeVirtualMemReserve_t)( + ze_context_handle_t hContext, + const void* pStart, + size_t size, + void** pptr); +typedef ze_result_t (*zeVirtualMemSetAccessAttribute_t)( + ze_context_handle_t hContext, + const void* ptr, + size_t size, + ze_memory_access_attribute_t access); + +bool load_level_zero_library(); +void unload_level_zero_library(); + +#define zeCheck_dynamic(x) \ + do { \ + if (!load_level_zero_library()) { \ + throw std::runtime_error("Level Zero library not available"); \ + } \ + ze_result_t result = (x); \ + if (result != ZE_RESULT_SUCCESS) { \ + auto e = zeException(result); \ + std::cout << "Throw " << e.what() << std::endl; \ + throw e; \ + } \ + } while (0) + +#define zeInit_dynamic(flags) zeInit_ptr(flags) +#define zeMemGetAddressRange_dynamic(ctx, ptr, base, size) \ + zeMemGetAddressRange_ptr(ctx, ptr, base, size) +#define zeMemGetIpcHandle_dynamic(ctx, ptr, handle) \ + zeMemGetIpcHandle_ptr(ctx, ptr, handle) +#define zeMemOpenIpcHandle_dynamic(ctx, dev, handle, flags, ptr) \ + zeMemOpenIpcHandle_ptr(ctx, dev, handle, flags, ptr) +#define zeMemCloseIpcHandle_dynamic(ctx, ptr) zeMemCloseIpcHandle_ptr(ctx, ptr) +#define zeVirtualMemMap_dynamic(ctx, ptr, size, phys_mem, offset, access) \ + zeVirtualMemMap_ptr(ctx, ptr, size, phys_mem, offset, access) +#define zeVirtualMemReserve_dynamic(ctx, start, size, ptr) \ + zeVirtualMemReserve_ptr(ctx, start, size, ptr) +#define zeVirtualMemSetAccessAttribute_dynamic(ctx, ptr, size, access) \ + zeVirtualMemSetAccessAttribute_ptr(ctx, ptr, size, access) + +// Exception handling class +class zeException : std::exception { + const char* zeResultToString(ze_result_t status) const { + static const std::unordered_map zeResultToStringMap{ + {ZE_RESULT_SUCCESS, "[Core] success"}, + {ZE_RESULT_NOT_READY, "[Core] synchronization primitive not signaled"}, + {ZE_RESULT_ERROR_UNINITIALIZED, + "[Validation] driver is not initialized"}, + {ZE_RESULT_ERROR_INVALID_NULL_POINTER, + "[Validation] pointer argument may not be nullptr"}, + 
{ZE_RESULT_ERROR_INVALID_NULL_HANDLE, + "[Validation] handle argument is not valid"}, + {ZE_RESULT_ERROR_INVALID_ENUMERATION, + "[Validation] enumerator argument is not valid"}, + {ZE_RESULT_ERROR_INVALID_SIZE, "[Validation] size argument is invalid"}, + {ZE_RESULT_ERROR_UNSUPPORTED_SIZE, + "[Validation] size argument is not supported by the device"}, + {ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT, + "[Validation] alignment argument is not supported by the device"}, + {ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, + "[Validation] generic error code for unsupported features"}, + {ZE_RESULT_ERROR_INVALID_NATIVE_BINARY, + "[Validation] native binary is not supported by the device"}, + {ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY, + "[Core] insufficient host memory to satisfy call"}, + {ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY, + "[Core] insufficient device memory to satisfy call"}, + {ZE_RESULT_ERROR_DEVICE_LOST, + "[Core] device hung, reset, was removed, or driver update occurred"}, + {ZE_RESULT_ERROR_MODULE_BUILD_FAILURE, + "[Core] error occurred when building module, see build log for details"}, + {ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE, + "[Validation] object pointed to by handle still in-use by device"}, + }; + auto it = zeResultToStringMap.find(status); + if (it != zeResultToStringMap.end()) + return it->second; + else + return "Unknown Reason"; + } + + public: + zeException(ze_result_t ret) : result_(ret) {} + + ze_result_t result_; + + const char* what() const noexcept override { + return zeResultToString(result_); + } +}; + +#define zeCheck(x) \ + if (x != ZE_RESULT_SUCCESS) { \ + auto e = zeException(x); \ + std::cout << "Throw " << e.what() << std::endl; \ + throw e; \ + } + +static zeInit_t zeInit_ptr = nullptr; +static zeMemGetAddressRange_t zeMemGetAddressRange_ptr = nullptr; +static zeMemGetIpcHandle_t zeMemGetIpcHandle_ptr = nullptr; +static zeMemOpenIpcHandle_t zeMemOpenIpcHandle_ptr = nullptr; +static zeMemCloseIpcHandle_t zeMemCloseIpcHandle_ptr = nullptr; +static zeVirtualMemMap_t zeVirtualMemMap_ptr = nullptr; +static zeVirtualMemReserve_t zeVirtualMemReserve_ptr = nullptr; +static zeVirtualMemSetAccessAttribute_t zeVirtualMemSetAccessAttribute_ptr = + nullptr; + +static void* ze_handle = nullptr; + +inline bool load_level_zero_library() { + if (ze_handle != nullptr) { + return true; + } + const char* lib_names[] = {"libze_loader.so"}; + + for (const char* lib_name : lib_names) { + ze_handle = dlopen(lib_name, RTLD_LAZY); + if (ze_handle != nullptr) { + break; + } + } + + if (ze_handle == nullptr) { + std::cerr << "Failed to load Level Zero library: " << dlerror() + << std::endl; + return false; + } + + zeInit_ptr = (zeInit_t)dlsym(ze_handle, "zeInit"); + zeMemGetAddressRange_ptr = + (zeMemGetAddressRange_t)dlsym(ze_handle, "zeMemGetAddressRange"); + zeMemGetIpcHandle_ptr = + (zeMemGetIpcHandle_t)dlsym(ze_handle, "zeMemGetIpcHandle"); + zeMemOpenIpcHandle_ptr = + (zeMemOpenIpcHandle_t)dlsym(ze_handle, "zeMemOpenIpcHandle"); + zeMemCloseIpcHandle_ptr = + (zeMemCloseIpcHandle_t)dlsym(ze_handle, "zeMemCloseIpcHandle"); + zeVirtualMemMap_ptr = (zeVirtualMemMap_t)dlsym(ze_handle, "zeVirtualMemMap"); + zeVirtualMemReserve_ptr = + (zeVirtualMemReserve_t)dlsym(ze_handle, "zeVirtualMemReserve"); + zeVirtualMemSetAccessAttribute_ptr = (zeVirtualMemSetAccessAttribute_t)dlsym( + ze_handle, "zeVirtualMemSetAccessAttribute"); + + if (!zeInit_ptr || !zeMemGetAddressRange_ptr || !zeMemGetIpcHandle_ptr || + !zeMemOpenIpcHandle_ptr || !zeMemCloseIpcHandle_ptr || + !zeVirtualMemMap_ptr || !zeVirtualMemReserve_ptr || + 
!zeVirtualMemSetAccessAttribute_ptr) { + std::cerr << "Failed to load Level Zero API functions" << std::endl; + dlclose(ze_handle); + ze_handle = nullptr; + return false; + } + + return true; +} + +inline void unload_level_zero_library() { + if (ze_handle != nullptr) { + dlclose(ze_handle); + ze_handle = nullptr; + zeInit_ptr = nullptr; + zeMemGetAddressRange_ptr = nullptr; + zeMemGetIpcHandle_ptr = nullptr; + zeMemOpenIpcHandle_ptr = nullptr; + zeMemCloseIpcHandle_ptr = nullptr; + zeVirtualMemMap_ptr = nullptr; + zeVirtualMemReserve_ptr = nullptr; + zeVirtualMemSetAccessAttribute_ptr = nullptr; + } +} + +extern "C" { + +__attribute__((weak)) ze_result_t zeVirtualMemMap( + ze_context_handle_t hContext, + const void* ptr, + size_t size, + ze_physical_mem_handle_t hPhysicalMemory, + size_t offset, + ze_memory_access_attribute_t access) { + if (!load_level_zero_library() || !zeVirtualMemMap_ptr) { + return ZE_RESULT_ERROR_UNINITIALIZED; + } + return zeVirtualMemMap_ptr( + hContext, ptr, size, hPhysicalMemory, offset, access); +} + +__attribute__((weak)) ze_result_t zeVirtualMemReserve( + ze_context_handle_t hContext, + const void* pStart, + size_t size, + void** pptr) { + if (!load_level_zero_library() || !zeVirtualMemReserve_ptr) { + return ZE_RESULT_ERROR_UNINITIALIZED; + } + return zeVirtualMemReserve_ptr(hContext, pStart, size, pptr); +} + +__attribute__((weak)) ze_result_t zeVirtualMemSetAccessAttribute( + ze_context_handle_t hContext, + const void* ptr, + size_t size, + ze_memory_access_attribute_t access) { + if (!load_level_zero_library() || !zeVirtualMemSetAccessAttribute_ptr) { + return ZE_RESULT_ERROR_UNINITIALIZED; + } + return zeVirtualMemSetAccessAttribute_ptr(hContext, ptr, size, access); +} +} diff --git a/test/xpu/distributed/test_c10d_xccl.py b/test/xpu/distributed/test_c10d_xccl.py index 916524073c..02b6e4e59d 100644 --- a/test/xpu/distributed/test_c10d_xccl.py +++ b/test/xpu/distributed/test_c10d_xccl.py @@ -12,6 +12,7 @@ import torch import torch.distributed as c10d +import torch.distributed._functional_collectives as _functional_collectives if not c10d.is_available() or not c10d.is_xccl_available(): print("c10d XCCL not available, skipping tests", file=sys.stderr) @@ -364,6 +365,31 @@ def test_nan_assert(self, type): # reset env os.environ["TORCH_XCCL_NAN_CHECK"] = "0" + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_oom(self): + pg = self._create_process_group_xccl() + dp_ranks = range(0, self.world_size) + dp_group = c10d.new_group(dp_ranks) + device = torch.device(f"xpu:{self.rank}") + torch.xpu.set_device(device) + + shape = (16384 * 2, 16384 * 2) + weight = torch.ones(shape, device=device).half() + gradient = torch.zeros(shape, device=device).half() + ret = torch.randn(shape, device=device).half() + + for iter in range(50): + output = torch.empty_like(ret) + output = ret + weight + gradient + ret = torch.nn.functional.linear(output, weight=ret) + dist.all_reduce(ret, op=dist.ReduceOp.SUM) + torch.xpu.synchronize() + self.assertLess( + torch.xpu.max_memory_allocated(), + torch.xpu.max_memory_reserved() * 2, + ) + class CommTest(MultiProcessTestCase): @property @@ -626,6 +652,86 @@ def test_all_gather_into_tensor(self): tensor.view(torch.float32), ) + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_unwaited(self) -> None: + # Verify that the process can terminate gracefully + # even with unwaited tensors + store = c10d.FileStore(self.file_name, self.world_size) + c10d.init_process_group( + backend="xccl", rank=self.rank, world_size=self.world_size, store=store 
+ ) + + # Case 1: Run collectives under context manager, and don't call wait on them. + with _functional_collectives.allow_inflight_collective_as_graph_input_ctx(): + self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 0) + input = torch.full( + (10240, 10240), float(self.rank), device=f"xpu:{self.rank}" + ) + dist.all_reduce(input, op=dist.ReduceOp.SUM, async_op=True) + # Non-functional collectives run under the context manager is registered in the work registry. + self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 1) + # Running another collective on the same tensor should still work + dist.all_reduce(input, op=dist.ReduceOp.SUM, async_op=True) + self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 2) + + # Case 2: Run collectives not under context manager, and don't call wait on them. + # NOTE: Here we intentionally test memory-stressed case. + self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 2) + for _ in range(50000): + input = torch.full( + (1024, 1024), float(self.rank), device=f"xpu:{self.rank}" + ) + dist.all_reduce(input, op=dist.ReduceOp.SUM, async_op=True) + # Work registry size is unchanged, since non-functional collectives not run under + # the context manager is not registered in the work registry. + self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 2) + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_wait_tensor(self) -> None: + # Verify that c10d_functional.wait_tensor() can be invoked on + # output tensor of non-functional collective + store = c10d.FileStore(self.file_name, self.world_size) + c10d.init_process_group( + backend="xccl", rank=self.rank, world_size=self.world_size, store=store + ) + + # Case 1: under context manager (i.e. work is registered in registry) + with _functional_collectives.allow_inflight_collective_as_graph_input_ctx(): + input1 = torch.full((10, 10), float(self.rank), device=f"xpu:{self.rank}") + self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 0) + dist.all_reduce(input1, op=dist.ReduceOp.SUM, async_op=True) + self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 1) + torch.ops.c10d_functional.wait_tensor(input1) + self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 0) + + input2 = torch.full((10, 10), float(self.rank), device=f"xpu:{self.rank}") + self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 0) + work = dist.all_reduce(input2, op=dist.ReduceOp.SUM, async_op=True) + self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 1) + work.wait() + self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 0) + self.assertEqual(input1, input2) + + # Case 2: not under context manager (i.e. 
work is not registered in registry) + input1 = torch.full((10, 10), float(self.rank), device=f"xpu:{self.rank}") + self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 0) + dist.all_reduce(input1, op=dist.ReduceOp.SUM, async_op=True) + self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 0) + # this does not take effect, since the underlying wait_tensor() logic would not + # be able to find the corresponding work object (because it's not registered in registry) + torch.ops.c10d_functional.wait_tensor(input1) + self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 0) + + input2 = torch.full((10, 10), float(self.rank), device=f"xpu:{self.rank}") + self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 0) + work = dist.all_reduce(input2, op=dist.ReduceOp.SUM, async_op=True) + self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 0) + work.wait() + self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 0) + self.assertEqual(input1, input2) + instantiate_parametrized_tests(ProcessGroupXCCLTest) diff --git a/test/xpu/distributed/test_symmetric_memory_xccl.py b/test/xpu/distributed/test_symmetric_memory_xccl.py new file mode 100644 index 0000000000..37f5d3e6da --- /dev/null +++ b/test/xpu/distributed/test_symmetric_memory_xccl.py @@ -0,0 +1,85 @@ +import torch +import torch.distributed as dist +from test_c10d_xccl import init_multigpu_helper, requires_xccl +from torch.distributed._symmetric_memory import ( + _fused_all_gather_matmul_fallback, + _fused_matmul_reduce_scatter_fallback, +) + +from torch.testing._internal.common_distributed import MultiProcContinuousTest +from torch.testing._internal.common_utils import ( + instantiate_parametrized_tests, + parametrize, + run_tests +) + +@instantiate_parametrized_tests +class AsyncTPTest(MultiProcContinuousTest): + @property + def device(self) -> torch.device: + return torch.device("xpu", self.rank) + + def _init_process(self): + torch.xpu.set_device(self.device) + torch.manual_seed(42 + self.rank) + torch.use_deterministic_algorithms(True) + torch.set_deterministic_debug_mode("warn") + torch.utils.deterministic.fill_uninitialized_memory = True + + @requires_xccl() + @parametrize("gather_dim", [0, 1]) + def test_fused_all_gather_matmul(self, gather_dim: int) -> None: + self._init_process() + BATCH = 8 + M = 64 + N = 16 + K = 32 + group = dist.group.WORLD + rank = self.rank + + torch.manual_seed(42 + rank) + A_shard = torch.rand(BATCH, M // self.world_size, K, device="xpu") + Bs = [torch.rand(K, N, device="xpu") for _ in range(3)] + + ag_output_0, mm_outputs_0 = _fused_all_gather_matmul_fallback( + A_shard, Bs, gather_dim=gather_dim, group_name=group.group_name + ) + ag_output_1, mm_outputs_1 = torch.ops.symm_mem.fused_all_gather_matmul( + A_shard, Bs, gather_dim=gather_dim, group_name=group.group_name + ) + + self.assertEqual(ag_output_0, ag_output_1) + self.assertEqual(ag_output_0.stride(), ag_output_1.stride()) + for mm_output_0, mm_output_1 in zip(mm_outputs_0, mm_outputs_1): + self.assertEqual(mm_output_0, mm_output_1) + self.assertEqual(mm_output_0.stride(), mm_output_1.stride()) + + @requires_xccl() + @parametrize("scatter_dim", [0, 1]) + def test_fused_matmul_reduce_scatter(self, scatter_dim: int) -> None: + self._init_process() + + BATCH = 8 + M = 64 + N = 16 + K = 32 + group = dist.group.WORLD + rank = self.rank + + torch.manual_seed(42 + rank) + A = torch.rand(BATCH, M, K, device="xpu") + B = torch.rand(K, N, device="xpu") + + output_0 = 
_fused_matmul_reduce_scatter_fallback( + A, B, "avg", scatter_dim=scatter_dim, group_name=group.group_name + ) + output_1 = torch.ops.symm_mem.fused_matmul_reduce_scatter( + A, B, "avg", scatter_dim=scatter_dim, group_name=group.group_name + ) + + self.assertEqual(output_0, output_1) + self.assertEqual(output_0.stride(), output_1.stride()) + + +if __name__ == "__main__": + run_tests() diff --git a/test/xpu/extended/run_test_with_skip_mtl.py b/test/xpu/extended/run_test_with_skip_mtl.py index e2683b64ce..4b6eaa5960 100644 --- a/test/xpu/extended/run_test_with_skip_mtl.py +++ b/test/xpu/extended/run_test_with_skip_mtl.py @@ -19,5 +19,6 @@ os.environ["PYTORCH_TEST_WITH_SLOW"] = "1" test_command = ["-k", skip_options, "test_ops_xpu.py", "-v"] +test_command.extend(["--junit-xml", "./op_extended.xml"]) res = pytest.main(test_command) sys.exit(res) diff --git a/test/xpu/run_distributed.py b/test/xpu/run_distributed.py index ddde5f8c8a..4965406165 100644 --- a/test/xpu/run_distributed.py +++ b/test/xpu/run_distributed.py @@ -26,6 +26,10 @@ def run(test_command): test_command = ["python", "distributed/test_c10d_ops_xccl.py"] res += run(test_command) +test_command = ["python", "distributed/test_c10d_xccl.py"] +res += run(test_command) +test_command = ["python", "distributed/test_symmetric_memory_xccl.py"] +res += run(test_command) # run pytest with skiplist for key in skip_dict: diff --git a/test/xpu/run_test_win_with_skip_mtl.py b/test/xpu/run_test_win_with_skip_mtl.py index 4ba56dba33..06a4849a6c 100644 --- a/test/xpu/run_test_win_with_skip_mtl.py +++ b/test/xpu/run_test_win_with_skip_mtl.py @@ -18,6 +18,7 @@ sys.stdout = StringIO() test_command = ["-k", skip_options, "../../../../test/test_xpu.py", "-v"] +test_command.extend(["--junit-xml", "./test_xpu.xml"]) res = pytest.main(test_command) output = sys.stdout.getvalue()
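
For readers who want to exercise the new symmetric-memory path outside the pytest harness, the following is a minimal, hypothetical sketch (not part of this patch). It mirrors the call sites in test/xpu/distributed/test_symmetric_memory_xccl.py and assumes an XCCL-enabled build, one XPU per rank, a 2-rank torchrun launch, and that torch.ops.symm_mem.fused_all_gather_matmul keeps the schema used by the test; the script name and main() wrapper are illustrative only.

# demo_fused_all_gather_matmul.py (hypothetical file name)
# Launch with: torchrun --nproc-per-node=2 demo_fused_all_gather_matmul.py
import os

import torch
import torch.distributed as dist


def main() -> None:
    rank = int(os.environ["RANK"])
    world_size = int(os.environ["WORLD_SIZE"])
    # torchrun provides MASTER_ADDR/MASTER_PORT, so the default env:// rendezvous works.
    dist.init_process_group(backend="xccl", rank=rank, world_size=world_size)
    device = torch.device(f"xpu:{rank}")
    torch.xpu.set_device(device)

    # Same shapes as the test: A is sharded along M, B is replicated.
    batch, m, n, k = 8, 64, 16, 32
    a_shard = torch.rand(batch, m // world_size, k, device=device)
    b = torch.rand(k, n, device=device)

    group_name = dist.group.WORLD.group_name
    ag_out, mm_outs = torch.ops.symm_mem.fused_all_gather_matmul(
        a_shard, [b], gather_dim=1, group_name=group_name
    )
    # ag_out is the all-gathered A; mm_outs[0] is ag_out @ b.
    print(rank, tuple(ag_out.shape), tuple(mm_outs[0].shape))

    dist.destroy_process_group()


if __name__ == "__main__":
    main()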