From 8c32883045e43ddcba43abfdc38a768b3cad312f Mon Sep 17 00:00:00 2001
From: weissenh <50957092+weissenh@users.noreply.github.com>
Date: Thu, 4 Dec 2025 03:39:47 +0100
Subject: [PATCH 1/3] Added Weiwei Sun catch-all id and more info on
 weiwei-sun-sd

The ORCID was copied from two papers in the XML that the issue submitter identified as their own.
---
 data/yaml/name_variants.yaml | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/data/yaml/name_variants.yaml b/data/yaml/name_variants.yaml
index 277227befc..f634791c53 100644
--- a/data/yaml/name_variants.yaml
+++ b/data/yaml/name_variants.yaml
@@ -9609,7 +9609,12 @@
   variants:
   - {first: Sheng-He, last: Sun}
 - canonical: {first: Weiwei, last: Sun}
-  comment: Shandong University
+  comment: May refer to several people
+  id: weiwei-sun
+- canonical: {first: Weiwei, last: Sun}
+  comment: CMU
+  degree: Carnegie Mellon University
+  orcid: 0000-0002-4817-9500
   id: weiwei-sun-sd
 - canonical: {first: Yufang, last: Sun}
   variants:

From f9d7e3d3bc40da4d65fcf2cfc06933d43a0a9f9d Mon Sep 17 00:00:00 2001
From: weissenh <50957092+weissenh@users.noreply.github.com>
Date: Thu, 4 Dec 2025 03:44:07 +0100
Subject: [PATCH 2/3] Adding weiwei-sun catch-all ID to all papers, more papers
 to weiwei-sun-sd

Paper assignments were made by comparing against their homepage publication list.
---
 data/xml/2011.mtsummit.xml | 2 +-
 data/xml/2020.acl.xml      | 6 +++---
 data/xml/2020.emnlp.xml    | 2 +-
 data/xml/2021.bea.xml      | 2 +-
 data/xml/2021.cl.xml       | 2 +-
 data/xml/2021.naacl.xml    | 2 +-
 data/xml/2023.cxgsnlp.xml  | 2 +-
 data/xml/2024.acl.xml      | 4 ++--
 data/xml/2024.cl.xml       | 2 +-
 data/xml/2024.emnlp.xml    | 2 +-
 data/xml/2024.lchange.xml  | 2 +-
 data/xml/2024.lrec.xml     | 4 ++--
 data/xml/2024.naacl.xml    | 2 +-
 data/xml/2025.babylm.xml   | 2 +-
 data/xml/2025.cmcl.xml     | 2 +-
 data/xml/2025.emnlp.xml    | 2 +-
 data/xml/2025.findings.xml | 2 +-
 data/xml/2025.tacl.xml     | 2 +-
 data/xml/2025.winlp.xml    | 2 +-
 data/xml/C08.xml           | 2 +-
 data/xml/C10.xml           | 2 +-
 data/xml/C12.xml           | 2 +-
 data/xml/D09.xml           | 2 +-
 data/xml/D11.xml           | 2 +-
 data/xml/D17.xml           | 2 +-
 data/xml/D18.xml           | 2 +-
 data/xml/I13.xml           | 2 +-
 data/xml/J16.xml           | 4 ++--
 data/xml/J19.xml           | 2 +-
 data/xml/K17.xml           | 4 ++--
 data/xml/K18.xml           | 2 +-
 data/xml/K19.xml           | 2 +-
 data/xml/L10.xml           | 2 +-
 data/xml/P09.xml           | 2 +-
 data/xml/P10.xml           | 4 ++--
 data/xml/P11.xml           | 2 +-
 data/xml/P12.xml           | 4 ++--
 data/xml/P14.xml           | 2 +-
 data/xml/P15.xml           | 2 +-
 data/xml/P17.xml           | 4 ++--
 data/xml/P18.xml           | 6 +++---
 data/xml/P19.xml           | 2 +-
 data/xml/Q13.xml           | 2 +-
 data/xml/S14.xml           | 2 +-
 data/xml/S15.xml           | 2 +-
 data/xml/S19.xml           | 2 +-
 data/xml/W08.xml           | 2 +-
 data/xml/W10.xml           | 2 +-
 data/xml/Y09.xml           | 2 +-
 49 files changed, 60 insertions(+), 60 deletions(-)

diff --git a/data/xml/2011.mtsummit.xml b/data/xml/2011.mtsummit.xml
index aea98aade5..852ad2adae 100644
--- a/data/xml/2011.mtsummit.xml
+++ b/data/xml/2011.mtsummit.xml
@@ -429,7 +429,7 @@
     <paper id="46">
       <title>Generating Virtual Parallel Corpus: A Compatibility Centric Method</title>
       <author><first>Jia</first><last>Xu</last></author>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <url hash="09f1e154">2011.mtsummit-papers.46</url>
       <bibkey>xu-sun-2011-generating</bibkey>
     </paper>
diff --git a/data/xml/2020.acl.xml b/data/xml/2020.acl.xml
index fab064d73e..b82a74885c 100644
--- a/data/xml/2020.acl.xml
+++ b/data/xml/2020.acl.xml
@@ -5081,7 +5081,7 @@
     <paper id="377">
       <title>Exact yet Efficient Graph Parsing, Bi-directional Locality and the Constructivist Hypothesis</title>
       <author><first>Yajie</first><last>Ye</last></author>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <pages>4100–4110</pages>
       <abstract>A key problem in processing graph-based meaning representations is graph parsing, i.e. computing all possible derivations of a given graph according to a (competence) grammar. We demonstrate, for the first time, that exact graph parsing can be efficient for large graphs and with large Hyperedge Replacement Grammars (HRGs). The advance is achieved by exploiting locality as terminal edge-adjacency in HRG rules. In particular, we highlight the importance of 1) a terminal edge-first parsing strategy, 2) a categorization of a subclass of HRG, i.e. what we call Weakly Regular Graph Grammar, and 3) distributing argument-structures to both lexical and phrasal rules.</abstract>
       <url hash="231b0280">2020.acl-main.377</url>
@@ -8167,7 +8167,7 @@
     <paper id="605">
       <title>Parsing into Variable-in-situ Logico-Semantic Graphs</title>
       <author><first>Yufei</first><last>Chen</last></author>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <pages>6772–6782</pages>
       <abstract>We propose variable-in-situ logico-semantic graphs to bridge the gap between semantic graph and logical form parsing. The new type of graph-based meaning representation allows us to include analysis for scope-related phenomena, such as quantification, negation and modality, in a way that is consistent with the state-of-the-art underspecification approach. Moreover, the well-formedness of such a graph is clear, since model-theoretic interpretation is available. We demonstrate the effectiveness of this new perspective by developing a new state-of-the-art semantic parser for English Resource Semantics. At the core of this parser is a novel neural graph rewriting system which combines the strengths of Hyperedge Replacement Grammar, a knowledge-intensive model, and Graph Neural Networks, a data-intensive model. Our parser achieves an accuracy of 92.39% in terms of elementary dependency match, which is a 2.88 point improvement over the best data-driven model in the literature. The output of our parser is highly coherent: at least 91% graphs are valid, in that they allow at least one sound scope-resolved logical form.</abstract>
       <url hash="bd78729c">2020.acl-main.605</url>
@@ -8179,7 +8179,7 @@
     <paper id="606">
       <title>Semantic Parsing for <fixed-case>E</fixed-case>nglish as a Second Language</title>
       <author><first>Yuanyuan</first><last>Zhao</last></author>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <author><first>Junjie</first><last>Cao</last></author>
       <author><first>Xiaojun</first><last>Wan</last></author>
       <pages>6783–6794</pages>
diff --git a/data/xml/2020.emnlp.xml b/data/xml/2020.emnlp.xml
index 13e7ec9f9e..c14beadf7a 100644
--- a/data/xml/2020.emnlp.xml
+++ b/data/xml/2020.emnlp.xml
@@ -1405,7 +1405,7 @@
       <title>Coding Textual Inputs Boosts the Accuracy of Neural Networks</title>
       <author><first>Abdul Rafae</first><last>Khan</last></author>
       <author><first>Jia</first><last>Xu</last></author>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <pages>1350–1360</pages>
       <abstract>Natural Language Processing (NLP) tasks are usually performed word by word on textual inputs. We can use arbitrary symbols to represent the linguistic meaning of a word and use these symbols as inputs. As “alternatives” to a text representation, we introduce Soundex, MetaPhone, NYSIIS, logogram to NLP, and develop fixed-output-length coding and its extension using Huffman coding. Each of those codings combines different character/digital sequences and constructs a new vocabulary based on codewords. We find that the integration of those codewords with text provides more reliable inputs to Neural-Network-based NLP systems through redundancy than text-alone inputs. Experiments demonstrate that our approach outperforms the state-of-the-art models on the application of machine translation, language modeling, and part-of-speech tagging. The source code is available at <url>https://github.com/abdulrafae/coding_nmt</url>.</abstract>
       <url hash="53b5562f">2020.emnlp-main.104</url>
diff --git a/data/xml/2021.bea.xml b/data/xml/2021.bea.xml
index 495e34b31b..d6250ac0fa 100644
--- a/data/xml/2021.bea.xml
+++ b/data/xml/2021.bea.xml
@@ -27,7 +27,7 @@
       <author><first>Mengyu</first><last>Zhang</last></author>
       <author><first>Weiqi</first><last>Wang</last></author>
       <author><first>Shuqiao</first><last>Sun</last></author>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <pages>1–10</pages>
       <abstract>This paper studies Negation Scope Resolution (NSR) for Chinese as a Second Language (CSL), which shows many unique characteristics that distinguish itself from “standard” Chinese. We annotate a new moderate-sized corpus that covers two background L1 languages, viz. English and Japanese. We build a neural NSR system, which achieves a new state-of-the-art accuracy on English benchmark data. We leverage this system to gauge how successful NSR for CSL can be. Different native language backgrounds of language learners result in unequal cross-lingual transfer, which has a significant impact on processing second language data. In particular, manual annotation, empirical evaluation and error analysis indicate two non-obvious facts: 1) L2-Chinese, L1-Japanese data are more difficult to analyze and thus annotate than L2-Chinese, L1-English data; 2) computational models trained on L2-Chinese, L1-Japanese data perform better than models trained on L2-Chinese, L1-English data.</abstract>
       <url hash="f3f9c66d">2021.bea-1.1</url>
diff --git a/data/xml/2021.cl.xml b/data/xml/2021.cl.xml
index 30abce1010..a14b1b6bec 100644
--- a/data/xml/2021.cl.xml
+++ b/data/xml/2021.cl.xml
@@ -35,7 +35,7 @@
       <title>Comparing Knowledge-Intensive and Data-Intensive Models for <fixed-case>E</fixed-case>nglish Resource Semantic Parsing</title>
       <author><first>Junjie</first><last>Cao</last></author>
       <author><first>Zi</first><last>Lin</last></author>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <author><first>Xiaojun</first><last>Wan</last></author>
       <doi>10.1162/coli_a_00395</doi>
       <abstract>In this work, we present a phenomenon-oriented comparative analysis of the two dominant approaches in English Resource Semantic (ERS) parsing: classic, knowledge-intensive and neural, data-intensive models. To reflect state-of-the-art neural NLP technologies, a factorization-based parser is introduced that can produce Elementary Dependency Structures much more accurately than previous data-driven parsers. We conduct a suite of tests for different linguistic phenomena to analyze the grammatical competence of different parsers, where we show that, despite comparable performance overall, knowledge- and data-intensive models produce different types of errors, in a way that can be explained by their theoretical properties. This analysis is beneficial to in-depth evaluation of several representative parsing techniques and leads to new directions for parser development.</abstract>
diff --git a/data/xml/2021.naacl.xml b/data/xml/2021.naacl.xml
index e10381b920..b71d9aeef1 100644
--- a/data/xml/2021.naacl.xml
+++ b/data/xml/2021.naacl.xml
@@ -5929,7 +5929,7 @@
       <author><first>Yiyang</first><last>Hou</last></author>
       <author><first>Yajie</first><last>Ye</last></author>
       <author><first>Li</first><last>Liang</last></author>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <pages>5554–5566</pages>
       <abstract>Universal Semantic Tagging aims to provide lightweight unified analysis for all languages at the word level. Though the proposed annotation scheme is conceptually promising, the feasibility is only examined in four Indo–European languages. This paper is concerned with extending the annotation scheme to handle Mandarin Chinese and empirically study the plausibility of unifying meaning representations for multiple languages. We discuss a set of language-specific semantic phenomena, propose new annotation specifications and build a richly annotated corpus. The corpus consists of 1100 English–Chinese parallel sentences, where compositional semantic analysis is available for English, and another 1000 Chinese sentences which has enriched syntactic analysis. By means of the new annotations, we also evaluate a series of neural tagging models to gauge how successful semantic tagging can be: accuracies of 92.7% and 94.6% are obtained for Chinese and English respectively. The English tagging performance is remarkably better than the state-of-the-art by 7.7%.</abstract>
       <url hash="63148334">2021.naacl-main.440</url>
diff --git a/data/xml/2023.cxgsnlp.xml b/data/xml/2023.cxgsnlp.xml
index a49bba20d5..f0f9089678 100644
--- a/data/xml/2023.cxgsnlp.xml
+++ b/data/xml/2023.cxgsnlp.xml
@@ -56,7 +56,7 @@
     <paper id="5">
       <title>Constructivist Tokenization for <fixed-case>E</fixed-case>nglish</title>
       <author><first>Allison</first><last>Fan</last></author>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <pages>36-40</pages>
       <abstract>This paper revisits tokenization from a theoretical perspective, and argues for the necessity of a constructivist approach to tokenization for semantic parsing and modeling language acquisition. We consider two problems: (1) (semi-) automatically converting existing lexicalist annotations, e.g. those of the Penn TreeBank, into constructivist annotations, and (2) automatic tokenization of raw texts. We demonstrate that (1) a heuristic rule-based constructivist tokenizer is able to yield relatively satisfactory accuracy when gold standard Penn TreeBank part-of-speech tags are available, but that some manual annotations are still necessary to obtain gold standard results, and (2) a neural tokenizer is able to provide accurate automatic constructivist tokenization results from raw character sequences. Our research output also includes a set of high-quality morpheme-tokenized corpora, which enable the training of computational models that more closely align with language comprehension and acquisition.</abstract>
       <url hash="0963dbef">2023.cxgsnlp-1.5</url>
diff --git a/data/xml/2024.acl.xml b/data/xml/2024.acl.xml
index dbf7f28fe5..b49466bd67 100644
--- a/data/xml/2024.acl.xml
+++ b/data/xml/2024.acl.xml
@@ -1815,7 +1815,7 @@
     <paper id="129">
       <title><fixed-case>MEFT</fixed-case>: Memory-Efficient Fine-Tuning through Sparse Adapter</title>
       <author><first>Jitai</first><last>Hao</last></author>
-      <author><first>Weiwei</first><last>Sun</last><affiliation>Carnegie Mellon University</affiliation></author>
+      <author id="weiwei-sun-sd"><first>Weiwei</first><last>Sun</last><affiliation>Carnegie Mellon University</affiliation></author>
       <author><first>Xin</first><last>Xin</last></author>
       <author><first>Qi</first><last>Meng</last><affiliation>Microsoft and Academy of Mathematics and Systems Science, Chinese Academy of Sciences, Chinese Academy of Sciences</affiliation></author>
       <author><first>Zhumin</first><last>Chen</last><affiliation>Shandong University</affiliation></author>
@@ -5509,7 +5509,7 @@
       <title>Generate-then-Ground in Retrieval-Augmented Generation for Multi-hop Question Answering</title>
       <author><first>Zhengliang</first><last>Shi</last></author>
       <author><first>Shuo</first><last>Zhang</last></author>
-      <author><first>Weiwei</first><last>Sun</last><affiliation>Carnegie Mellon University</affiliation></author>
+      <author id="weiwei-sun-sd"><first>Weiwei</first><last>Sun</last><affiliation>Carnegie Mellon University</affiliation></author>
       <author><first>Shen</first><last>Gao</last><affiliation>University of Electronic Science and Technology of China</affiliation></author>
       <author><first>Pengjie</first><last>Ren</last><affiliation>Shandong University</affiliation></author>
       <author><first>Zhumin</first><last>Chen</last><affiliation>Shandong University</affiliation></author>
diff --git a/data/xml/2024.cl.xml b/data/xml/2024.cl.xml
index ef42300f9e..7322d9bab6 100644
--- a/data/xml/2024.cl.xml
+++ b/data/xml/2024.cl.xml
@@ -169,7 +169,7 @@
       <author><first>Wenxi</first><last>Li</last></author>
       <author><first>Yutong</first><last>Zhang</last></author>
       <author><first>Guy</first><last>Emerson</last></author>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <doi>10.1162/coli_a_00504</doi>
       <abstract>Divergence of languages observed at the surface level is a major challenge encountered by multilingual data representation, especially when typologically distant languages are involved. Drawing inspiration from a formalist Chomskyan perspective towards language universals, Universal Grammar (UG), this article uses deductively pre-defined universals to analyze a multilingually heterogeneous phenomenon, event nominals. In this way, deeper universality of event nominals beneath their huge divergence in different languages is uncovered, which empowers us to break barriers between languages and thus extend insights from some synthetic languages to a non-inflectional language, Mandarin Chinese. Our empirical investigation also demonstrates this UG-inspired schema is effective: With its assistance, the inter-annotator agreement (IAA) for identifying event nominals in Mandarin grows from 88.02% to 94.99%, and automatic detection of event-reading nominalizations on the newly-established data achieves an accuracy of 94.76% and an F1 score of 91.3%, which significantly surpass those achieved on the pre-existing resource by 9.8% and 5.2%, respectively. Our systematic analysis also sheds light on nominal semantic role labeling. By providing a clear definition and classification on arguments of event nominal, the IAA of this task significantly increases from 90.46% to 98.04%.</abstract>
       <pages>535–561</pages>
diff --git a/data/xml/2024.emnlp.xml b/data/xml/2024.emnlp.xml
index 6c914acb15..694b2781f3 100644
--- a/data/xml/2024.emnlp.xml
+++ b/data/xml/2024.emnlp.xml
@@ -10858,7 +10858,7 @@
     </paper>
     <paper id="778">
       <title><fixed-case>MAIR</fixed-case>: A Massive Benchmark for Evaluating Instructed Retrieval</title>
-      <author><first>Weiwei</first><last>Sun</last><affiliation>Carnegie Mellon University</affiliation></author>
+      <author id="weiwei-sun-sd"><first>Weiwei</first><last>Sun</last><affiliation>Carnegie Mellon University</affiliation></author>
       <author><first>Zhengliang</first><last>Shi</last></author>
       <author><first>Wu Jiu</first><last>Long</last></author>
       <author><first>Lingyong</first><last>Yan</last><affiliation>Baidu Inc.</affiliation></author>
diff --git a/data/xml/2024.lchange.xml b/data/xml/2024.lchange.xml
index f075328c7f..851746c4cc 100644
--- a/data/xml/2024.lchange.xml
+++ b/data/xml/2024.lchange.xml
@@ -144,7 +144,7 @@
     <paper id="12">
       <title><fixed-case>E</fixed-case>tymo<fixed-case>L</fixed-case>ink: A Structured <fixed-case>E</fixed-case>nglish Etymology Dataset</title>
       <author><first>Yuan</first><last>Gao</last><affiliation>University of Cambridge</affiliation></author>
-      <author><first>Weiwei</first><last>Sun</last><affiliation>University of Cambridge</affiliation></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last><affiliation>University of Cambridge</affiliation></author>
       <pages>126-136</pages>
       <abstract/>
       <url hash="183478e8">2024.lchange-1.12</url>
diff --git a/data/xml/2024.lrec.xml b/data/xml/2024.lrec.xml
index 56d1229604..9ea359719e 100644
--- a/data/xml/2024.lrec.xml
+++ b/data/xml/2024.lrec.xml
@@ -8545,7 +8545,7 @@
     <paper id="722">
       <title>How Large Language Models Encode Context Knowledge? A Layer-Wise Probing Study</title>
       <author><first>Tianjie</first><last>Ju</last></author>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun-sd"><first>Weiwei</first><last>Sun</last></author>
       <author><first>Wei</first><last>Du</last></author>
       <author><first>Xinwei</first><last>Yuan</last></author>
       <author><first>Zhaochun</first><last>Ren</last></author>
@@ -9217,7 +9217,7 @@
       <title>Improving the Robustness of Large Language Models via Consistency Alignment</title>
       <author><first>Yukun</first><last>Zhao</last></author>
       <author><first>Lingyong</first><last>Yan</last></author>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun-sd"><first>Weiwei</first><last>Sun</last></author>
       <author><first>Guoliang</first><last>Xing</last></author>
       <author><first>Shuaiqiang</first><last>Wang</last></author>
       <author><first>Chong</first><last>Meng</last></author>
diff --git a/data/xml/2024.naacl.xml b/data/xml/2024.naacl.xml
index aec119e8ec..8b3711c6fc 100644
--- a/data/xml/2024.naacl.xml
+++ b/data/xml/2024.naacl.xml
@@ -5577,7 +5577,7 @@
       <title>Knowing What <fixed-case>LLM</fixed-case>s <fixed-case>DO</fixed-case> <fixed-case>NOT</fixed-case> Know: A Simple Yet Effective Self-Detection Method</title>
       <author><first>Yukun</first><last>Zhao</last></author>
       <author><first>Lingyong</first><last>Yan</last><affiliation>Baidu Inc.</affiliation></author>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun-sd"><first>Weiwei</first><last>Sun</last></author>
       <author><first>Guoliang</first><last>Xing</last></author>
       <author><first>Chong</first><last>Meng</last><affiliation>Baidu</affiliation></author>
       <author><first>Shuaiqiang</first><last>Wang</last><affiliation>Baidu Inc.</affiliation></author>
diff --git a/data/xml/2025.babylm.xml b/data/xml/2025.babylm.xml
index d27a390e68..fc0bcb2b40 100644
--- a/data/xml/2025.babylm.xml
+++ b/data/xml/2025.babylm.xml
@@ -172,7 +172,7 @@
       <author orcid="0000-0001-7201-7387"><first>Suchir</first><last>Salhan</last><affiliation>University of Cambridge</affiliation></author>
       <author><first>Andrew</first><last>Caines</last><affiliation>University of Cambridge</affiliation></author>
       <author><first>Paula</first><last>Buttery</last><affiliation>University of Cambridge</affiliation></author>
-      <author><first>Weiwei</first><last>Sun</last><affiliation>University of Cambridge</affiliation></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last><affiliation>University of Cambridge</affiliation></author>
       <pages>160-174</pages>
       <abstract>Cross-lingual extensions of the BabyLM Shared Task beyond English incentivise the development of Small Language Models that simulate a much wider range of language acquisition scenarios, including code-switching, simultaneous and successive bilingualism and second language acquisition. However, to our knowledge, there is no benchmark of the formal competence of cognitively-inspired models of L2 acquisition, or <b>L2LMs</b>. To address this, we introduce a <b>Benchmark of Learner Interlingual Syntactic Structure (BLiSS)</b>. BLiSS consists of 1.5M naturalistic minimal pairs dataset derived from errorful sentence–correction pairs in parallel learner corpora. These are systematic patterns –overlooked by standard benchmarks of the formal competence of Language Models – which we use to evaluate L2LMs trained in a variety of training regimes on specific properties of L2 learner language to provide a linguistically-motivated framework for controlled measure of the interlanguage competence of L2LMs.</abstract>
       <url hash="2e7f6503">2025.babylm-main.13</url>
diff --git a/data/xml/2025.cmcl.xml b/data/xml/2025.cmcl.xml
index 68dfe028d9..54482f554c 100644
--- a/data/xml/2025.cmcl.xml
+++ b/data/xml/2025.cmcl.xml
@@ -68,7 +68,7 @@
       <title>Profiling neural grammar induction on morphemically tokenised child-directed speech</title>
       <author><first>Mila</first><last>Marcheva</last></author>
       <author orcid="0000-0003-3840-7618"><first>Theresa</first><last>Biberauer</last><affiliation>University of the Western Cape, University of Stellenbosch and University of Cambridge</affiliation></author>
-      <author><first>Weiwei</first><last>Sun</last><affiliation>University of Cambridge</affiliation></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last><affiliation>University of Cambridge</affiliation></author>
       <pages>47-54</pages>
       <abstract>We investigate the performance of state-of-the-art (SotA) neural grammar induction (GI) models on a morphemically tokenised English dataset based on the CHILDES treebank (Pearl and Sprouse, 2013). Using implementations from Yang et al. (2021a), we train models and evaluate them with the standard F1 score. We introduce novel evaluation metrics—depth-of-morpheme and sibling-of-morpheme—which measure phenomena around bound morpheme attachment. Our results reveal that models with the highest F1 scores do not necessarily induce linguistically plausible structures for bound morpheme attachment, highlighting a key challenge for cognitively plausible GI.</abstract>
       <url hash="df2c343b">2025.cmcl-1.7</url>
diff --git a/data/xml/2025.emnlp.xml b/data/xml/2025.emnlp.xml
index 111a52800d..bb106964d4 100644
--- a/data/xml/2025.emnlp.xml
+++ b/data/xml/2025.emnlp.xml
@@ -8339,7 +8339,7 @@
     <paper id="557">
       <title>A Computational Simulation of Language Production in First Language Acquisition</title>
       <author><first>Yuan</first><last>Gao</last><affiliation>Computer Laboratory</affiliation></author>
-      <author><first>Weiwei</first><last>Sun</last><affiliation>University of Cambridge</affiliation></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last><affiliation>University of Cambridge</affiliation></author>
       <pages>11003-11017</pages>
       <abstract>We introduce a computational framework for modeling child language production, focusing on the acquisition of the competence to map meaning onto linguistic form. Our approach uses graphs to formalize meaning and Synchronous Hyperedge Replacement Grammar (SHRG) to formalize the syntax–semantics interface.The setup provides computationally-sound induction algorithms of statistical grammar knowledge. We induce SHRGs solely from semantic graphs, and the resulting interpretable grammars are evaluated by their ability to generate utterances—providing a novel controlled paradigm to simulate child language acquisition.A notable finding is that unsupervised statistical learning (analogous to children’s implicit learning mechanisms) performs as well as the corresponding supervised oracle when a proper symbolic grammar is assumed (reflecting knowledge gained via comprehension).</abstract>
       <url hash="05c490e5">2025.emnlp-main.557</url>
diff --git a/data/xml/2025.findings.xml b/data/xml/2025.findings.xml
index 12955bcd07..d9296307ae 100644
--- a/data/xml/2025.findings.xml
+++ b/data/xml/2025.findings.xml
@@ -23938,7 +23938,7 @@
       <title>Compositional Syntactico-<fixed-case>S</fixed-case>em<fixed-case>B</fixed-case>anking for <fixed-case>E</fixed-case>nglish as a Second or Foreign Language</title>
       <author><first>Wenxi</first><last>Li</last></author>
       <author><first>Xihao</first><last>Wang</last><affiliation>Peking University</affiliation></author>
-      <author><first>Weiwei</first><last>Sun</last><affiliation>University of Cambridge</affiliation></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last><affiliation>University of Cambridge</affiliation></author>
       <pages>24395-24406</pages>
       <abstract>Despite the widespread use of English as a Second or Foreign Language (ESFL), developing syntactico-semantic representations for it is limited — the irregularities in ESFL complicate systematic composition and subsequently the derivation of its semantics.This paper draws on constructivism and proposes a novel Synchronous Hyperedge Replacement Grammar (SHRG)-based constructivist approach to address the challenges. By using constructions as fundamental units, this approach not only accommodates both the idiosyncrasies and the compositional nature of ESFL, but also bridges the gap between literal cues and intended meaning.The feasibility of this constructivist approach is demonstrated using real ESFL data, resulting in a gold-standard, medium-sized syntactico-semantic bank that covers a wide range of ESFL phenomena.</abstract>
       <url hash="4db22a47">2025.findings-acl.1252</url>
diff --git a/data/xml/2025.tacl.xml b/data/xml/2025.tacl.xml
index b264506acf..d1b350f6cb 100644
--- a/data/xml/2025.tacl.xml
+++ b/data/xml/2025.tacl.xml
@@ -301,7 +301,7 @@
     <paper id="21">
       <title>Phonetic Reconstruction of the Consonant System of Middle <fixed-case>C</fixed-case>hinese via Mixed Integer Optimization</title>
       <author><first>Xiaoxi</first><last>Luo</last></author>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <doi>10.1162/tacl_a_00742</doi>
       <abstract>This paper is concerned with phonetic reconstruction of the consonant system of Middle Chinese. We propose to cast the problem as a Mixed Integer Programming problem, which is able to automatically explore homophonic information from ancient rhyme dictionaries and phonetic information from modern Chinese dialects, the descendants of Middle Chinese. Numerical evaluation on a wide range of synthetic and real data demonstrates the effectiveness and robustness of the new method. We apply the method to information from Guǎngyùn and 20 modern Chinese dialects to obtain a new phonetic reconstruction result. A linguistically motivated discussion of this result is also provided.1</abstract>
       <pages>424–441</pages>
diff --git a/data/xml/2025.winlp.xml b/data/xml/2025.winlp.xml
index 3afe2f5a53..4a345860fb 100644
--- a/data/xml/2025.winlp.xml
+++ b/data/xml/2025.winlp.xml
@@ -97,7 +97,7 @@
     <paper id="12">
       <title>Transfer learning for dependency parsing of <fixed-case>V</fixed-case>edic <fixed-case>S</fixed-case>anskrit</title>
       <author><first>Abhiram</first><last>Vinjamuri</last></author>
-      <author><first>Weiwei</first><last>Sun</last><affiliation>University of Cambridge</affiliation></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last><affiliation>University of Cambridge</affiliation></author>
       <pages>50-55</pages>
       <abstract>This paper focuses on data-driven dependency parsing for Vedic Sanskrit. We propose and evaluate a transfer learning approach that benefits from syntactic analysis of typologically related languages, including Ancient Greek and Latin, and a descendant language - Classical Sanskrit. Experiments on the Vedic TreeBank demonstrate the effectiveness of cross-lingual transfer, demonstrating improvements from the biaffine baseline as well as outperforming the current state of the art benchmark, the deep contextualised self-training algorithm, across a wide range of experimental setups.</abstract>
       <url hash="c5ae9809">2025.winlp-main.12</url>
diff --git a/data/xml/C08.xml b/data/xml/C08.xml
index a138e5c127..2bd98d89cd 100644
--- a/data/xml/C08.xml
+++ b/data/xml/C08.xml
@@ -915,7 +915,7 @@
     </paper>
     <paper id="105">
       <title>Prediction of Maximal Projection for Semantic Role Labeling</title>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <author><first>Zhifang</first><last>Sui</last></author>
       <author><first>Haifeng</first><last>Wang</last></author>
       <pages>833–840</pages>
diff --git a/data/xml/C10.xml b/data/xml/C10.xml
index 7e9c795e37..e6e681398f 100644
--- a/data/xml/C10.xml
+++ b/data/xml/C10.xml
@@ -2697,7 +2697,7 @@
     </paper>
     <paper id="139">
       <title>Word-based and Character-based Word Segmentation Models: Comparison and Combination</title>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <pages>1211–1219</pages>
       <url hash="b07372f5">C10-2139</url>
       <bibkey>sun-2010-word</bibkey>
diff --git a/data/xml/C12.xml b/data/xml/C12.xml
index 8c0ddfc2de..ab6619d39b 100644
--- a/data/xml/C12.xml
+++ b/data/xml/C12.xml
@@ -506,7 +506,7 @@
     <paper id="53">
       <title>Semantic Cohesion Model for Phrase-Based <fixed-case>SMT</fixed-case></title>
       <author><first>Minwei</first><last>Feng</last></author>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <author><first>Hermann</first><last>Ney</last></author>
       <pages>867–878</pages>
       <url hash="9d562f8a">C12-1053</url>
diff --git a/data/xml/D09.xml b/data/xml/D09.xml
index 955abf17af..f38b215b4d 100644
--- a/data/xml/D09.xml
+++ b/data/xml/D09.xml
@@ -1354,7 +1354,7 @@
     </paper>
     <paper id="153">
       <title><fixed-case>C</fixed-case>hinese Semantic Role Labeling with Shallow Parsing</title>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <author><first>Zhifang</first><last>Sui</last></author>
       <author><first>Meng</first><last>Wang</last></author>
       <author><first>Xin</first><last>Wang</last></author>
diff --git a/data/xml/D11.xml b/data/xml/D11.xml
index ed927e2356..bbe7aca489 100644
--- a/data/xml/D11.xml
+++ b/data/xml/D11.xml
@@ -854,7 +854,7 @@
     </paper>
     <paper id="90">
       <title>Enhancing <fixed-case>C</fixed-case>hinese Word Segmentation Using Unlabeled Data</title>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <author><first>Jia</first><last>Xu</last></author>
       <pages>970–979</pages>
       <url hash="2ff972c9">D11-1090</url>
diff --git a/data/xml/D17.xml b/data/xml/D17.xml
index b7a427eda4..d8fcd228ad 100644
--- a/data/xml/D17.xml
+++ b/data/xml/D17.xml
@@ -46,7 +46,7 @@
       <title>Quasi-Second-Order Parsing for 1-Endpoint-Crossing, Pagenumber-2 Graphs</title>
       <author><first>Junjie</first><last>Cao</last></author>
       <author><first>Sheng</first><last>Huang</last></author>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <author><first>Xiaojun</first><last>Wan</last></author>
       <pages>24–34</pages>
       <url hash="804175ee">D17-1003</url>
diff --git a/data/xml/D18.xml b/data/xml/D18.xml
index da4ff395fd..be8194ee82 100644
--- a/data/xml/D18.xml
+++ b/data/xml/D18.xml
@@ -5229,7 +5229,7 @@
       <author><first>Zi</first><last>Lin</last></author>
       <author><first>Yuguang</first><last>Duan</last></author>
       <author><first>Yuanyuan</first><last>Zhao</last></author>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <author><first>Xiaojun</first><last>Wan</last></author>
       <pages>3793–3802</pages>
       <url hash="c4999b6a">D18-1414</url>
diff --git a/data/xml/I13.xml b/data/xml/I13.xml
index d8138c48fa..eb7b119038 100644
--- a/data/xml/I13.xml
+++ b/data/xml/I13.xml
@@ -205,7 +205,7 @@
     </paper>
     <paper id="21">
       <title>Capturing Long-distance Dependencies in Sequence Models: A Case Study of <fixed-case>C</fixed-case>hinese Part-of-speech Tagging</title>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <author><first>Xiaochang</first><last>Peng</last></author>
       <author><first>Xiaojun</first><last>Wan</last></author>
       <pages>180–188</pages>
diff --git a/data/xml/J16.xml b/data/xml/J16.xml
index 8e57bca125..9c74eafa92 100644
--- a/data/xml/J16.xml
+++ b/data/xml/J16.xml
@@ -157,7 +157,7 @@
       <title>Transition-Based Parsing for Deep Dependency Structures</title>
       <author><first>Xun</first><last>Zhang</last></author>
       <author><first>Yantao</first><last>Du</last></author>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <author><first>Xiaojun</first><last>Wan</last></author>
       <pages>353–389</pages>
       <doi>10.1162/COLI_a_00252</doi>
@@ -166,7 +166,7 @@
     </paper>
     <paper id="2">
       <title>Towards Accurate and Efficient <fixed-case>C</fixed-case>hinese Part-of-Speech Tagging</title>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <author><first>Xiaojun</first><last>Wan</last></author>
       <pages>391–419</pages>
       <doi>10.1162/COLI_a_00253</doi>
diff --git a/data/xml/J19.xml b/data/xml/J19.xml
index 8e9ddf762b..a40f7530cf 100644
--- a/data/xml/J19.xml
+++ b/data/xml/J19.xml
@@ -40,7 +40,7 @@
     </paper>
     <paper id="3">
       <title>Parsing <fixed-case>C</fixed-case>hinese Sentences with Grammatical Relations</title>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <author><first>Yufei</first><last>Chen</last></author>
       <author><first>Xiaojun</first><last>Wan</last></author>
       <author><first>Meichun</first><last>Liu</last></author>
diff --git a/data/xml/K17.xml b/data/xml/K17.xml
index 9a2129b259..34b720aa33 100644
--- a/data/xml/K17.xml
+++ b/data/xml/K17.xml
@@ -62,7 +62,7 @@
     </paper>
     <paper id="5">
       <title>Parsing for Grammatical Relations via Graph Merging</title>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <author><first>Yantao</first><last>Du</last></author>
       <author><first>Xiaojun</first><last>Wan</last></author>
       <pages>26–35</pages>
@@ -400,7 +400,7 @@
     <paper id="35">
       <title>The Covert Helps Parse the Overt</title>
       <author><first>Xun</first><last>Zhang</last></author>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <author><first>Xiaojun</first><last>Wan</last></author>
       <pages>343–353</pages>
       <url hash="e92a3e90">K17-1035</url>
diff --git a/data/xml/K18.xml b/data/xml/K18.xml
index 0ebc70288e..4e7aa09c5f 100644
--- a/data/xml/K18.xml
+++ b/data/xml/K18.xml
@@ -624,7 +624,7 @@
       <author><first>Sheng</first><last>Huang</last></author>
       <author><first>Fang</first><last>Wang</last></author>
       <author><first>Junjie</first><last>Cao</last></author>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <author><first>Xiaojun</first><last>Wan</last></author>
       <pages>562–572</pages>
       <url hash="51182282">K18-1054</url>
diff --git a/data/xml/K19.xml b/data/xml/K19.xml
index bd138b9624..83941adea9 100644
--- a/data/xml/K19.xml
+++ b/data/xml/K19.xml
@@ -1437,7 +1437,7 @@
       <title>Peking at <fixed-case>MRP</fixed-case> 2019: Factorization- and Composition-Based Parsing for Elementary Dependency Structures</title>
       <author><first>Yufei</first><last>Chen</last></author>
       <author><first>Yajie</first><last>Ye</last></author>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <pages>166–176</pages>
       <abstract>We design, implement and evaluate two semantic parsers, which represent factorization- and composition-based approaches respectively, for Elementary Dependency Structures (EDS) at the CoNLL 2019 Shared Task on Cross-Framework Meaning Representation Parsing. The detailed evaluation of the two parsers gives us a new perception about parsing into linguistically enriched meaning representations: current neural EDS parsers are able to reach an accuracy at the inter-annotator agreement level in the same-epoch-and-domain setup.</abstract>
       <url hash="a8a2198e">K19-2016</url>
diff --git a/data/xml/L10.xml b/data/xml/L10.xml
index c40bae1f6b..01ab4ea9f8 100644
--- a/data/xml/L10.xml
+++ b/data/xml/L10.xml
@@ -2427,7 +2427,7 @@
       <author><first>Meng</first><last>Wang</last></author>
       <author><first>Chu-Ren</first><last>Huang</last></author>
       <author><first>Shiwen</first><last>Yu</last></author>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <title>Automatic Acquisition of <fixed-case>C</fixed-case>hinese Novel Noun Compounds</title>
       <url>http://www.lrec-conf.org/proceedings/lrec2010/pdf/377_Paper.pdf</url>
       <abstract>Automatic acquisition of novel compounds is notoriously difficult because most novel compounds have relatively low frequency in a corpus. The current study proposes a new method to deal with the novel compound acquisition challenge. We model this task as a two-class classification problem in which a candidate compound is either classified as a compound or a non-compound. A machine learning method using SVM, incorporating two types of linguistically motivated features: semantic features and character features, is applied to identify rare but valid noun compounds. We explore two kinds of training data: one is virtual training data which is obtained by three statistical scores, i.e. co-occurrence frequency, mutual information and dependent ratio, from the frequent compounds; the other is real training data which is randomly selected from the infrequent compounds. We conduct comparative experiments, and the experimental results show that even with limited direct evidence in the corpus for the novel compounds, we can make full use of the typical frequent compounds to help in the discovery of the novel compounds.</abstract>
diff --git a/data/xml/P09.xml b/data/xml/P09.xml
index 079b46f867..1f7d14af70 100644
--- a/data/xml/P09.xml
+++ b/data/xml/P09.xml
@@ -1703,7 +1703,7 @@
     </paper>
     <paper id="64">
       <title>Prediction of Thematic Rank for Structured Semantic Role Labeling</title>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <author><first>Zhifang</first><last>Sui</last></author>
       <author><first>Meng</first><last>Wang</last></author>
       <pages>253–256</pages>
diff --git a/data/xml/P10.xml b/data/xml/P10.xml
index 06ae476425..c76e0cb729 100644
--- a/data/xml/P10.xml
+++ b/data/xml/P10.xml
@@ -1570,7 +1570,7 @@
     </paper>
     <paper id="19">
       <title>Semantics-Driven Shallow Parsing for <fixed-case>C</fixed-case>hinese Semantic Role Labeling</title>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <pages>103–108</pages>
       <url hash="6dbf2661">P10-2019</url>
       <bibkey>sun-2010-semantics</bibkey>
@@ -1679,7 +1679,7 @@
     </paper>
     <paper id="31">
       <title>Improving <fixed-case>C</fixed-case>hinese Semantic Role Labeling with Rich Syntactic Features</title>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <pages>168–172</pages>
       <url hash="c2e11772">P10-2031</url>
       <bibkey>sun-2010-improving</bibkey>
diff --git a/data/xml/P11.xml b/data/xml/P11.xml
index 0e021e9cfb..eeaacc6fc4 100644
--- a/data/xml/P11.xml
+++ b/data/xml/P11.xml
@@ -1262,7 +1262,7 @@
     </paper>
     <paper id="139">
       <title>A Stacked Sub-Word Model for Joint <fixed-case>C</fixed-case>hinese Word Segmentation and Part-of-Speech Tagging</title>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <pages>1385–1394</pages>
       <url hash="474b1f1a">P11-1139</url>
       <bibkey>sun-2011-stacked</bibkey>
diff --git a/data/xml/P12.xml b/data/xml/P12.xml
index 560b819f3f..1f7215c75c 100644
--- a/data/xml/P12.xml
+++ b/data/xml/P12.xml
@@ -240,7 +240,7 @@
     </paper>
     <paper id="25">
       <title>Reducing Approximation and Estimation Errors for <fixed-case>C</fixed-case>hinese Lexical Processing with Heterogeneous Annotations</title>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <author><first>Xiaojun</first><last>Wan</last></author>
       <pages>232–241</pages>
       <url hash="5af6a0d8">P12-1025</url>
@@ -249,7 +249,7 @@
     </paper>
     <paper id="26">
       <title>Capturing Paradigmatic and Syntagmatic Lexical Relations: Towards Accurate <fixed-case>C</fixed-case>hinese Part-of-Speech Tagging</title>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <author><first>Hans</first><last>Uszkoreit</last></author>
       <pages>242–252</pages>
       <url hash="7509d5df">P12-1026</url>
diff --git a/data/xml/P14.xml b/data/xml/P14.xml
index add35d25d2..4b43f59790 100644
--- a/data/xml/P14.xml
+++ b/data/xml/P14.xml
@@ -508,7 +508,7 @@
     </paper>
     <paper id="42">
       <title>Grammatical Relations in <fixed-case>C</fixed-case>hinese: <fixed-case>GB</fixed-case>-Ground Extraction and Data-Driven Parsing</title>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <author><first>Yantao</first><last>Du</last></author>
       <author><first>Xin</first><last>Kou</last></author>
       <author><first>Shuoyang</first><last>Ding</last></author>
diff --git a/data/xml/P15.xml b/data/xml/P15.xml
index 81a76291df..063dc44dd4 100644
--- a/data/xml/P15.xml
+++ b/data/xml/P15.xml
@@ -1604,7 +1604,7 @@
     <paper id="149">
       <title>A Data-Driven, Factorization Parser for <fixed-case>CCG</fixed-case> Dependency Structures</title>
       <author><first>Yantao</first><last>Du</last></author>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <author><first>Xiaojun</first><last>Wan</last></author>
       <pages>1545–1555</pages>
       <url hash="4a78abf9">P15-1149</url>
diff --git a/data/xml/P17.xml b/data/xml/P17.xml
index 7975ee129d..86fddb8bcf 100644
--- a/data/xml/P17.xml
+++ b/data/xml/P17.xml
@@ -1019,7 +1019,7 @@ two word-vectors results in a vector that is only a small angle away from the ve
     </paper>
     <paper id="77">
       <title>Semantic Dependency Parsing via Book Embedding</title>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <author><first>Junjie</first><last>Cao</last></author>
       <author><first>Xiaojun</first><last>Wan</last></author>
       <pages>828–838</pages>
@@ -2494,7 +2494,7 @@ two word-vectors results in a vector that is only a small angle away from the ve
       <title>Parsing to 1-Endpoint-Crossing, Pagenumber-2 Graphs</title>
       <author><first>Junjie</first><last>Cao</last></author>
       <author><first>Sheng</first><last>Huang</last></author>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <author><first>Xiaojun</first><last>Wan</last></author>
       <pages>2110–2120</pages>
       <url hash="2667fea7">P17-1193</url>
diff --git a/data/xml/P18.xml b/data/xml/P18.xml
index b81ed978c4..5c2d1ee64f 100644
--- a/data/xml/P18.xml
+++ b/data/xml/P18.xml
@@ -541,7 +541,7 @@
     <paper id="38">
       <title>Accurate <fixed-case>SHRG</fixed-case>-Based Semantic Parsing</title>
       <author><first>Yufei</first><last>Chen</last></author>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <author><first>Xiaojun</first><last>Wan</last></author>
       <pages>408–418</pages>
       <abstract>We demonstrate that an SHRG-based parser can produce semantic graphs much more accurately than previously shown, by relating synchronous production rules to the syntacto-semantic composition process. Our parser achieves an accuracy of 90.35 for EDS (89.51 for DMRS) in terms of elementary dependency match, which is a 4.87 (5.45) point improvement over the best existing data-driven model, indicating, in our view, the importance of linguistically-informed derivation for data-driven semantic parsing. This accuracy is equivalent to that of English Resource Grammar guided models, suggesting that (recurrent) neural network models are able to effectively learn deep linguistic knowledge from annotations.</abstract>
@@ -2428,7 +2428,7 @@
     <paper id="179">
       <title>Language Generation via <fixed-case>DAG</fixed-case> Transduction</title>
       <author><first>Yajie</first><last>Ye</last></author>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <author><first>Xiaojun</first><last>Wan</last></author>
       <pages>1928–1937</pages>
       <abstract>A DAG automaton is a formal device for manipulating graphs. By augmenting a DAG automaton with transduction rules, a DAG transducer has potential applications in fundamental NLP tasks. In this paper, we propose a novel DAG transducer to perform graph-to-program transformation. The target structure of our transducer is a program licensed by a declarative programming language rather than linguistic structures. By executing such a program, we can easily get a surface string. Our transducer is designed especially for natural language generation (NLG) from type-logical semantic graphs. Taking Elementary Dependency Structures, a format of English Resource Semantics, as input, our NLG system achieves a BLEU-4 score of 68.07. This remarkable result demonstrates the feasibility of applying a DAG transducer to resolve NLG, as well as the effectiveness of our design.</abstract>
@@ -3418,7 +3418,7 @@
       <title>Pre- and In-Parsing Models for Neural Empty Category Detection</title>
       <author><first>Yufei</first><last>Chen</last></author>
       <author><first>Yuanyuan</first><last>Zhao</last></author>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <author><first>Xiaojun</first><last>Wan</last></author>
       <pages>2687–2696</pages>
       <abstract>Motivated by the positive impact of empty category on syntactic parsing, we study neural models for pre- and in-parsing detection of empty category, which has not previously been investigated. We find several non-obvious facts: (a) BiLSTM can capture non-local contextual information which is essential for detecting empty categories, (b) even with a BiLSTM, syntactic information is still able to enhance the detection, and (c) automatic detection of empty categories improves parsing quality for overt words. Our neural ECD models outperform the prior state-of-the-art by significant margins.</abstract>
diff --git a/data/xml/P19.xml b/data/xml/P19.xml
index 0cdab3ee01..9f6e19df3c 100644
--- a/data/xml/P19.xml
+++ b/data/xml/P19.xml
@@ -9584,7 +9584,7 @@
       <title>Graph-Based Meaning Representations: Design and Processing</title>
       <author><first>Alexander</first><last>Koller</last></author>
       <author><first>Stephan</first><last>Oepen</last></author>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <pages>6–11</pages>
       <abstract>This tutorial is on representing and processing sentence meaning in the form of labeled directed graphs. The tutorial will (a) briefly review relevant background in formal and linguistic semantics; (b) semi-formally define a unified abstract view on different flavors of semantic graphs and associated terminology; (c) survey common frameworks for graph-based meaning representation and available graph banks; and (d) offer a technical overview of a representative selection of different parsing approaches.</abstract>
       <url hash="db6fb488">P19-4002</url>
diff --git a/data/xml/Q13.xml b/data/xml/Q13.xml
index 1239c280e3..534947a5cc 100644
--- a/data/xml/Q13.xml
+++ b/data/xml/Q13.xml
@@ -294,7 +294,7 @@
     </paper>
     <paper id="25">
       <title>Data-driven, <fixed-case>PCFG</fixed-case>-based and Pseudo-<fixed-case>PCFG</fixed-case>-based Models for <fixed-case>C</fixed-case>hinese Dependency Parsing</title>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <author><first>Xiaojun</first><last>Wan</last></author>
       <doi>10.1162/tacl_a_00229</doi>
       <abstract>We present a comparative study of transition-, graph- and PCFG-based models aimed at illuminating more precisely the likely contribution of CFGs in improving Chinese dependency parsing accuracy, especially by combining heterogeneous models. Inspired by the impact of a constituency grammar on dependency parsing, we propose several strategies to acquire pseudo CFGs only from dependency annotations. Compared to linguistic grammars learned from rich phrase-structure treebanks, well designed pseudo grammars achieve similar parsing accuracy and have equivalent contributions to parser ensemble. Moreover, pseudo grammars increase the diversity of base models; therefore, together with all other models, further improve system combination. Based on automatic POS tagging, our final model achieves a UAS of 87.23%, resulting in a significant improvement of the state of the art.</abstract>
diff --git a/data/xml/S14.xml b/data/xml/S14.xml
index d2a50d657e..132311ced4 100644
--- a/data/xml/S14.xml
+++ b/data/xml/S14.xml
@@ -1111,7 +1111,7 @@
       <title><fixed-case>P</fixed-case>eking: Profiling Syntactic Tree Parsing Techniques for Semantic Graph Parsing</title>
       <author><first>Yantao</first><last>Du</last></author>
       <author><first>Fan</first><last>Zhang</last></author>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <author><first>Xiaojun</first><last>Wan</last></author>
       <pages>459–464</pages>
       <url hash="68f1eeee">S14-2080</url>
diff --git a/data/xml/S15.xml b/data/xml/S15.xml
index f5edc6e07f..ae509e88fe 100644
--- a/data/xml/S15.xml
+++ b/data/xml/S15.xml
@@ -2061,7 +2061,7 @@
       <author><first>Yantao</first><last>Du</last></author>
       <author><first>Fan</first><last>Zhang</last></author>
       <author><first>Xun</first><last>Zhang</last></author>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <author><first>Xiaojun</first><last>Wan</last></author>
       <pages>927–931</pages>
       <url hash="b87f1aba">S15-2154</url>
diff --git a/data/xml/S19.xml b/data/xml/S19.xml
index a1961b1730..124bc7ef76 100644
--- a/data/xml/S19.xml
+++ b/data/xml/S19.xml
@@ -565,7 +565,7 @@
       <author><first>Sheng</first><last>Huang</last></author>
       <author><first>Abdul Rafae</first><last>Khan</last></author>
       <author><first>Shengqiang</first><last>Zhang</last></author>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <author><first>Jia</first><last>Xu</last></author>
       <pages>92–96</pages>
       <abstract>This paper describes the systems of the CUNY-PKU team in SemEval 2019 Task 1: Cross-lingual Semantic Parsing with UCCA. We introduce a novel model by applying a cascaded MLP and BiLSTM model. Then, we ensemble multiple system-outputs by reparsing. In particular, we introduce a new decoding algorithm for building the UCCA representation. Our system won the first place in one track (French-20K-Open), second places in four tracks (English-Wiki-Open, English-20K-Open, German-20K-Open, and German-20K-Closed), and third place in one track (English-20K-Closed), among all seven tracks.</abstract>
diff --git a/data/xml/W08.xml b/data/xml/W08.xml
index bdb0037d16..22ee5c71d8 100644
--- a/data/xml/W08.xml
+++ b/data/xml/W08.xml
@@ -3268,7 +3268,7 @@
     </paper>
     <paper id="35">
       <title>The Integration of Dependency Relation Classification and Semantic Role Labeling Using Bilayer Maximum Entropy <fixed-case>M</fixed-case>arkov Models</title>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <author><first>Hongzhan</first><last>Li</last></author>
       <author><first>Zhifang</first><last>Sui</last></author>
       <pages>243–247</pages>
diff --git a/data/xml/W10.xml b/data/xml/W10.xml
index 48108b6691..0aaf57fe8d 100644
--- a/data/xml/W10.xml
+++ b/data/xml/W10.xml
@@ -6481,7 +6481,7 @@
     </paper>
     <paper id="44">
       <title>Discriminative Parse Reranking for <fixed-case>C</fixed-case>hinese with Homogeneous and Heterogeneous Annotations</title>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <author><first>Rui</first><last>Wang</last></author>
       <author><first>Yi</first><last>Zhang</last></author>
       <url hash="c12cf47f">W10-4144</url>
diff --git a/data/xml/Y09.xml b/data/xml/Y09.xml
index 23942388a9..23146c1246 100644
--- a/data/xml/Y09.xml
+++ b/data/xml/Y09.xml
@@ -507,7 +507,7 @@
     </paper>
     <paper id="11">
       <title><fixed-case>C</fixed-case>hinese Function Tag Labeling</title>
-      <author><first>Weiwei</first><last>Sun</last></author>
+      <author id="weiwei-sun"><first>Weiwei</first><last>Sun</last></author>
       <author><first>Zhifang</first><last>Sui</last></author>
       <pages>530–539</pages>
       <url hash="42dd4305">Y09-2011</url>

From 52e7fef5b346e9c36323d5c0e7167434231ee17b Mon Sep 17 00:00:00 2001
From: weissenh <50957092+weissenh@users.noreply.github.com>
Date: Thu, 4 Dec 2025 03:56:25 +0100
Subject: [PATCH 3/3] Adding weiwei-sun catch-all ID to editor paper too

---
 data/xml/2020.iwpt.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data/xml/2020.iwpt.xml b/data/xml/2020.iwpt.xml
index 1929816bde..0643dc22d7 100644
--- a/data/xml/2020.iwpt.xml
+++ b/data/xml/2020.iwpt.xml
@@ -8,7 +8,7 @@
       <editor><first>Stephan</first><last>Oepen</last></editor>
       <editor><first>Kenji</first><last>Sagae</last></editor>
       <editor><first>Djamé</first><last>Seddah</last></editor>
-      <editor><first>Weiwei</first><last>Sun</last></editor>
+      <editor id="weiwei-sun"><first>Weiwei</first><last>Sun</last></editor>
       <editor><first>Anders</first><last>Søgaard</last></editor>
       <editor><first>Reut</first><last>Tsarfaty</last></editor>
       <editor><first>Dan</first><last>Zeman</last></editor>