31 changes: 27 additions & 4 deletions .github/workflows/pylint.yml
@@ -1,6 +1,6 @@
name: Pylint

on: [push]
on: [push, pull_request]

jobs:
build:
@@ -10,14 +10,37 @@ jobs:
python-version: ["3.8", "3.9", "3.10"]
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v3
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install pylint
- name: Analysing the code with pylint
pip install pylint pyyaml
- name: Determine changed Python files
id: diff
shell: bash
run: |
set -euo pipefail
git fetch origin main --depth=1 || true
CHANGED=$(git diff --name-only origin/main...HEAD | grep -E '\.py$' || true)
echo "files<<EOF" >> "$GITHUB_OUTPUT"
echo "$CHANGED" >> "$GITHUB_OUTPUT"
echo "EOF" >> "$GITHUB_OUTPUT"
if [ -n "$CHANGED" ]; then
echo "any=true" >> "$GITHUB_OUTPUT"
else
echo "any=false" >> "$GITHUB_OUTPUT"
fi
- name: Analysing the code with pylint (changed files only)
if: steps.diff.outputs.any == 'true'
shell: bash
run: |
pylint $(git ls-files '*.py')
set -euo pipefail
echo "Changed Python files:"
printf "%s\n" "${{ steps.diff.outputs.files }}"
# Lint only errors; ignore refactor/convention/warning categories
printf "%s\n" "${{ steps.diff.outputs.files }}" | xargs -r pylint --disable=C,R,W
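The changed-file detection in this workflow filters the `git diff` output through a `grep` for the `.py` extension. A minimal local sketch of that filter, using illustrative filenames rather than real repository paths:

```shell
# Sketch: reproduce the workflow's changed-Python-file filter locally.
# Filenames are illustrative; in CI, the list comes from `git diff --name-only`.
CHANGED=$(printf 'src/app.py\nREADME.md\ntests/test_app.py\n' | grep -E '\.py$' || true)
printf '%s\n' "$CHANGED"
```

The trailing `|| true` matters under `set -euo pipefail`: `grep` exits non-zero when nothing matches, and without it the step would fail on commits that touch no Python files.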
37 changes: 37 additions & 0 deletions configs/pyedu-integration/README.md
@@ -0,0 +1,37 @@
# PyEdu Dataset Integration Configuration

This directory contains configuration files for integrating the pyedu dataset into OpenSeek training pipelines.

## About PyEdu Dataset

PyEdu is a high-quality dataset of educational Python code, drawn from the "stack-edu" subset of smollm-corpus. Key characteristics:

- **Source**: https://huggingface.co/datasets/Leon-Leee/unofficial-pyedu
- **Size**: ~6GB
- **Quality**: High-quality according to the smollm-v2 tech report
- **Content**: Educational Python code examples
- **Use Cases**: Further training, annealing, or synthesizing datasets

## Configuration Files

- `config_pyedu_integration.yaml`: Experiment-level configuration for pyedu integration
- `train/train_pyedu_integration.yaml`: Task-level configuration with pyedu dataset included

## Usage

To use these configurations:

1. Ensure the pyedu dataset is downloaded and preprocessed
2. Update the `dataset_base_dir` in the config file to point to your data directory
3. Adjust the data mixture ratios as needed for your specific training requirements
4. Run training with the provided configuration files

## Data Mixture Strategy

The pyedu dataset can be integrated into existing training pipelines in several ways:

1. **Annealing**: Use pyedu for final training phases to improve code understanding
2. **Synthesis**: Use pyedu as source material for generating additional training data
3. **Mixed Training**: Include pyedu as part of the regular training data mixture

The configuration provided uses a balanced approach, incorporating pyedu alongside existing code datasets.
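The mixture ratios mentioned above are expressed in the task-level YAML as a flat list alternating weight and dataset path. A small sketch (with hypothetical entries, not the exact config values) of how such a list pairs up and how relative sampling shares can be derived, assuming weights are normalized over their sum:

```python
# Sketch: pair up a flat [weight, path, weight, path, ...] mixture list
# and compute each dataset's relative sampling share.
# Weights and paths below are illustrative.

def parse_mixture(flat):
    pairs = list(zip(flat[0::2], flat[1::2]))  # (weight, path) tuples
    total = sum(w for w, _ in pairs)
    return [(path, w / total) for w, path in pairs]

mixture = [1.2, "pyedu/pyedu_text_document",
           0.8, "code-high/part_13_text_document"]
for path, share in parse_mixture(mixture):
    print(f"{path}: {share:.2f}")
```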
21 changes: 21 additions & 0 deletions configs/pyedu-integration/config_pyedu_integration.yaml
@@ -0,0 +1,21 @@
experiment:
exp_name: "pyedu-integration"
exp_dir: "./exp_out"
runner:
backend: "flagscale"
task: "train"
no_shared_fs: false

# Dataset configuration
dataset_base_dir: "/path/to/your/datasets" # Update this path

# Training configuration
save_steps: 1000
load: null
ckpt_format: "torch"
seed: 42

# Distributed training settings (adjust based on your setup)
nnodes: 1
nproc_per_node: 8
hostfile: null
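Step 2 of the usage notes asks for `dataset_base_dir` to be updated before training. A minimal sketch of doing that programmatically with PyYAML (which the CI workflow installs), shown here on an inline fragment mirroring the config above; the replacement path is illustrative:

```python
# Sketch: override dataset_base_dir in the experiment config.
# Shown on an inline YAML fragment; in practice, load and dump the
# actual config file instead. Requires pyyaml.
import yaml

CONFIG_TEXT = """
experiment:
  exp_name: "pyedu-integration"
  dataset_base_dir: "/path/to/your/datasets"
"""

cfg = yaml.safe_load(CONFIG_TEXT)
cfg["experiment"]["dataset_base_dir"] = "/data/openseek/datasets"  # illustrative path
print(cfg["experiment"]["dataset_base_dir"])
```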
155 changes: 155 additions & 0 deletions configs/pyedu-integration/train/train_pyedu_integration.yaml
@@ -0,0 +1,155 @@
system:
recompute_method: "uniform"
recompute_granularity: "full"
recompute_num_layers: 6
moe_router_dtype: fp32
no_shared_fs: ${experiment.runner.no_shared_fs}
num_workers: 4
tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
expert_model_parallel_size: 1
context_parallel_size: 1
disable_bias_linear: true
reset_position_ids: True
reset_attention_mask: True
Review comment on lines +13 to +14 (severity: medium):

For consistency within this YAML file, it's better to use lowercase `true` for boolean values. Other keys in this file, like `finetune` (line 20) and `bf16` (line 22), use lowercase booleans (`false`, `true`). Sticking to a consistent style improves readability and reduces potential parsing issues with different YAML loaders.

Suggested change:

  reset_position_ids: true
  reset_attention_mask: true

qk_layernorm: true
sequence_parallel: true
use_distributed_optimizer: true
overlap_grad_reduce: true
overlap_param_gather: true
finetune: false
precision:
bf16: true
attention_softmax_in_fp32: true
accumulate_allreduce_grads_in_fp32: true
logging:
log_interval: 1
tensorboard_log_interval: 1
wandb_project: ${experiment.exp_name}
wandb_exp_name: ${experiment.exp_name}
log_timers_to_tensorboard: true
log_validation_ppl_to_tensorboard: true
log_throughput: true
log_params_norm: true
log_num_zeros_in_grad: true
log_memory_to_tensorboard: true
checkpoint:
save_interval: ${experiment.save_steps}
load: ${experiment.load}
ckpt_format: ${experiment.ckpt_format}

model:
transformer_impl: transformer_engine
num_layers: 6
hidden_size: 1280
num_attention_heads: 10
group_query_attention: false
num_query_groups: 10
seq_length: 4096
max_position_embeddings: 4096
norm_epsilon: 1e-6
use_rotary_position_embeddings: true
rotary_base: 1000000
swiglu: true
normalization: RMSNorm
init_method_std: 6e-3
attention_dropout: 0.0
hidden_dropout: 0.0
clip_grad: 1.0
position_embedding_type: rope
untie_embeddings_and_output_weights: false
no_position_embedding: true
no_rope_fusion: true

# mla args
multi_latent_attention: true
kv_lora_rank: 512
qk_head_dim: 128
qk_pos_emb_head_dim: 64
v_head_dim: 128

# moe args
ffn_hidden_size: 7168
moe_ffn_hidden_size: 896
moe_grouped_gemm: true
moe_shared_expert_intermediate_size: 1792
num_experts: 64
moe_router_load_balancing_type: "seq_aux_loss"
moe_router_score_function: sigmoid
moe_router_enable_expert_bias: true
moe_router_bias_update_rate: 0.001
moe_aux_loss_coeff: 0.0001
moe_layer_freq: "[0]+[1]*5"
moe_router_num_groups: 1
moe_router_group_topk: 1
moe_router_topk: 6
moe_router_topk_scaling_factor: 2.446
moe_token_dispatcher_type: "alltoall"

# training
seed: ${experiment.seed}
micro_batch_size: 1
global_batch_size: 1024
eval_iters: 0
train_samples: 24576000 # 100B tokens

optimizer:
weight_decay: 0.1
adam_beta1: 0.9
adam_beta2: 0.95
lr_scheduler:
lr: 3.0e-3
min_lr: 3.0e-4
lr_warmup_samples: 2048000
lr_decay_style: cosine

data:
# PyEdu integration: Enhanced code training with educational Python dataset
data_path:
# Existing code datasets (reduced weights to make room for pyedu)
- 0.8000 # Reduced from original weight
- ${experiment.dataset_base_dir}/code-high/part_13_text_document
- 0.9000 # Reduced from original weight
- ${experiment.dataset_base_dir}/code-low/part_36_text_document
- 0.8000 # Reduced from original weight
- ${experiment.dataset_base_dir}/code-mid/part_37_text_document

# PyEdu dataset integration - high-quality educational Python code
- 1.2000 # Higher weight for high-quality educational content
- ${experiment.dataset_base_dir}/pyedu/pyedu_text_document

# Existing stack dataset (maintained)
- 0.4229
- ${experiment.dataset_base_dir}/stack/018_00000_text_document

# CoT synthesis for code (enhanced with pyedu influence)
- 0.5000 # Slightly increased for better code reasoning
- ${experiment.dataset_base_dir}/cot_synthesis2_code-high/4_text_document
- 0.7000 # Slightly increased
- ${experiment.dataset_base_dir}/cot_synthesis2_code-low/6_text_document
- 0.9000 # Slightly increased
- ${experiment.dataset_base_dir}/cot_synthesis2_code-mid/23_text_document

# Math datasets (maintained for balanced training)
- 1.8165
- ${experiment.dataset_base_dir}/math-high/part_04_text_document
- 1.6940
- ${experiment.dataset_base_dir}/math-low/part_10_text_document
- 1.6311
- ${experiment.dataset_base_dir}/math-mid/part_07_text_document

# ArXiv for scientific content (maintained)
- 0.6414
- ${experiment.dataset_base_dir}/arxiv/007_00000_text_document

# Wiki for general knowledge (maintained)
- 0.4202
- ${experiment.dataset_base_dir}/wiki/012_00000_text_document

split: 1
no_mmap_bin_files: true
tokenizer:
tokenizer_type: QwenTokenizerFS
tokenizer_path: ../hf_openseek/tokenizer
vocab_size: 151851
make_vocab_size_divisible_by: 64
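Two numbers in this config can be sanity-checked directly: Megatron-style frameworks typically round the tokenizer vocabulary up to a multiple of `make_vocab_size_divisible_by`, and the `train_samples` comment claims roughly 100B tokens at `seq_length: 4096`. A small sketch of both checks; the round-up rule is the conventional one, assumed rather than taken from this repository:

```python
# Sketch: Megatron-style vocab padding and the train_samples token budget.
def padded_vocab_size(vocab_size: int, divisor: int) -> int:
    # Conventional round-up to the nearest multiple of `divisor`.
    return ((vocab_size + divisor - 1) // divisor) * divisor

print(padded_vocab_size(151851, 64))  # 151872
print(24576000 * 4096)                # 100663296000 tokens, i.e. ~100.7B
```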
1 change: 1 addition & 0 deletions docs/Data.md
@@ -25,6 +25,7 @@ The pre-training dataset is mainly composed of collected and selected open source
- https://huggingface.co/datasets/OpenCoder-LLM/opc-fineweb-code-corpus
- https://huggingface.co/datasets/HuggingFaceTB/smollm-corpus
- https://huggingface.co/datasets/bigcode/the-stack-v2
- https://huggingface.co/datasets/Leon-Leee/unofficial-pyedu (pyedu: high-quality educational Python code subset from smollm-corpus, ~6GB)

## 2. Data Synthesis
- **Preliminary Reasoning Data Synthesis**: semantically segment and summarize the original pre-training documents, organize the CoT process, and summarize queries. Take {Query, CoT process, Original document} as one training sample.
1 change: 1 addition & 0 deletions tests/__init__.py
@@ -0,0 +1 @@
# OpenSeek Tests
Binary file added tests/__pycache__/__init__.cpython-312.pyc
Binary file not shown.