% quarto/Writing/coding-age-ai.bib
@video{bycloud2025LLMsRLRevelation,
entrysubtype = {video},
title = {The {{LLM}}'s {{RL Revelation We Didn't See Coming}}},
editor = {{bycloud}},
editortype = {director},
date = {2025-06-24},
url = {https://www.youtube.com/watch?v=z3awgfU4yno},
urldate = {2025-06-26},
keywords = {paper: coding_age_ai}
}
% == BibLaTeX quality report for bycloud2025LLMsRLRevelation:
% Unexpected field 'title'
% Unexpected field 'editor'
% Unexpected field 'editortype'
% ? Title looks like it was stored in title-case in Zotero
% ? unused Library catalog ("YouTube")
% ? unused Running time ("15:33")
@online{DeepSeek-AI2025DeepSeekR1IncentivizingReasoning,
title = {{{DeepSeek-R1}}: {{Incentivizing Reasoning Capability}} in {{LLMs}} via {{Reinforcement Learning}}},
shorttitle = {{{DeepSeek-R1}}},
author = {DeepSeek-AI and Guo, Daya and Yang, Dejian and Zhang, Haowei and Song, Junxiao and Zhang, Ruoyu and Xu, Runxin and Zhu, Qihao and Ma, Shirong and Wang, Peiyi and Bi, Xiao and Zhang, Xiaokang and Yu, Xingkai and Wu, Yu and Wu, Z. F. and Gou, Zhibin and Shao, Zhihong and Li, Zhuoshu and Gao, Ziyi and Liu, Aixin and Xue, Bing and Wang, Bingxuan and Wu, Bochao and Feng, Bei and Lu, Chengda and Zhao, Chenggang and Deng, Chengqi and Zhang, Chenyu and Ruan, Chong and Dai, Damai and Chen, Deli and Ji, Dongjie and Li, Erhang and Lin, Fangyun and Dai, Fucong and Luo, Fuli and Hao, Guangbo and Chen, Guanting and Li, Guowei and Zhang, H. and Bao, Han and Xu, Hanwei and Wang, Haocheng and Ding, Honghui and Xin, Huajian and Gao, Huazuo and Qu, Hui and Li, Hui and Guo, Jianzhong and Li, Jiashi and Wang, Jiawei and Chen, Jingchang and Yuan, Jingyang and Qiu, Junjie and Li, Junlong and Cai, J. L. and Ni, Jiaqi and Liang, Jian and Chen, Jin and Dong, Kai and Hu, Kai and Gao, Kaige and Guan, Kang and Huang, Kexin and Yu, Kuai and Wang, Lean and Zhang, Lecong and Zhao, Liang and Wang, Litong and Zhang, Liyue and Xu, Lei and Xia, Leyi and Zhang, Mingchuan and Zhang, Minghua and Tang, Minghui and Li, Meng and Wang, Miaojun and Li, Mingming and Tian, Ning and Huang, Panpan and Zhang, Peng and Wang, Qiancheng and Chen, Qinyu and Du, Qiushi and Ge, Ruiqi and Zhang, Ruisong and Pan, Ruizhe and Wang, Runji and Chen, R. J. and Jin, R. L. and Chen, Ruyi and Lu, Shanghao and Zhou, Shangyan and Chen, Shanhuang and Ye, Shengfeng and Wang, Shiyu and Yu, Shuiping and Zhou, Shunfeng and Pan, Shuting and Li, S. S. and Zhou, Shuang and Wu, Shaoqing and Ye, Shengfeng and Yun, Tao and Pei, Tian and Sun, Tianyu and Wang, T. and Zeng, Wangding and Zhao, Wanjia and Liu, Wen and Liang, Wenfeng and Gao, Wenjun and Yu, Wenqin and Zhang, Wentao and Xiao, W. L. and An, Wei and Liu, Xiaodong and Wang, Xiaohan and Chen, Xiaokang and Nie, Xiaotao and Cheng, Xin and Liu, Xin and Xie, Xin and Liu, Xingchao and Yang, Xinyu and Li, Xinyuan and Su, Xuecheng and Lin, Xuheng and Li, X. Q. and Jin, Xiangyue and Shen, Xiaojin and Chen, Xiaosha and Sun, Xiaowen and Wang, Xiaoxiang and Song, Xinnan and Zhou, Xinyi and Wang, Xianzu and Shan, Xinxia and Li, Y. K. and Wang, Y. Q. and Wei, Y. X. and Zhang, Yang and Xu, Yanhong and Li, Yao and Zhao, Yao and Sun, Yaofeng and Wang, Yaohui and Yu, Yi and Zhang, Yichao and Shi, Yifan and Xiong, Yiliang and He, Ying and Piao, Yishi and Wang, Yisong and Tan, Yixuan and Ma, Yiyang and Liu, Yiyuan and Guo, Yongqiang and Ou, Yuan and Wang, Yuduan and Gong, Yue and Zou, Yuheng and He, Yujia and Xiong, Yunfan and Luo, Yuxiang and You, Yuxiang and Liu, Yuxuan and Zhou, Yuyang and Zhu, Y. X. and Xu, Yanhong and Huang, Yanping and Li, Yaohui and Zheng, Yi and Zhu, Yuchen and Ma, Yunxian and Tang, Ying and Zha, Yukun and Yan, Yuting and Ren, Z. Z. and Ren, Zehui and Sha, Zhangli and Fu, Zhe and Xu, Zhean and Xie, Zhenda and Zhang, Zhengyan and Hao, Zhewen and Ma, Zhicheng and Yan, Zhigang and Wu, Zhiyu and Gu, Zihui and Zhu, Zijia and Liu, Zijun and Li, Zilin and Xie, Ziwei and Song, Ziyang and Pan, Zizheng and Huang, Zhen and Xu, Zhipeng and Zhang, Zhongyu and Zhang, Zhen},
date = {2025-01-22},
eprint = {2501.12948},
eprinttype = {arXiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2501.12948},
url = {http://arxiv.org/abs/2501.12948},
urldate = {2025-06-26},
abstract = {We introduce our first-generation reasoning models, DeepSeek-R1-Zero and DeepSeek-R1. DeepSeek-R1-Zero, a model trained via large-scale reinforcement learning (RL) without supervised fine-tuning (SFT) as a preliminary step, demonstrates remarkable reasoning capabilities. Through RL, DeepSeek-R1-Zero naturally emerges with numerous powerful and intriguing reasoning behaviors. However, it encounters challenges such as poor readability, and language mixing. To address these issues and further enhance reasoning performance, we introduce DeepSeek-R1, which incorporates multi-stage training and cold-start data before RL. DeepSeek-R1 achieves performance comparable to OpenAI-o1-1217 on reasoning tasks. To support the research community, we open-source DeepSeek-R1-Zero, DeepSeek-R1, and six dense models (1.5B, 7B, 8B, 14B, 32B, 70B) distilled from DeepSeek-R1 based on Qwen and Llama.},
pubstate = {prepublished},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning,paper: coding_age_ai},
file = {/home/drezil/Zotero/storage/5Z5755Y9/DeepSeek-AI et al. - 2025 - DeepSeek-R1 Incentivizing Reasoning Capability in LLMs via Reinforcement Learning.pdf;/home/drezil/Zotero/storage/SDRBNKQ3/2501.html}
}
% == BibLaTeX quality report for DeepSeek-AI2025DeepSeekR1IncentivizingReasoning:
% ? Title looks like it was stored in title-case in Zotero
% ? unused Number ("arXiv:2501.12948")
@online{Liu2025UnderstandingR1ZeroLikeTraining,
title = {Understanding {{R1-Zero-Like Training}}: {{A Critical Perspective}}},
shorttitle = {Understanding {{R1-Zero-Like Training}}},
author = {Liu, Zichen and Chen, Changyu and Li, Wenjun and Qi, Penghui and Pang, Tianyu and Du, Chao and Lee, Wee Sun and Lin, Min},
date = {2025-03-26},
eprint = {2503.20783},
eprinttype = {arXiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2503.20783},
url = {http://arxiv.org/abs/2503.20783},
urldate = {2025-06-26},
abstract = {DeepSeek-R1-Zero has shown that reinforcement learning (RL) at scale can directly enhance the reasoning capabilities of LLMs without supervised fine-tuning. In this work, we critically examine R1-Zero-like training by analyzing its two core components: base models and RL. We investigate a wide range of base models, including DeepSeek-V3-Base, to understand how pretraining characteristics influence RL performance. Our analysis reveals that DeepSeek-V3-Base already exhibit ''Aha moment'', while Qwen2.5 base models demonstrate strong reasoning capabilities even without prompt templates, suggesting potential pretraining biases. Additionally, we identify an optimization bias in Group Relative Policy Optimization (GRPO), which artificially increases response length (especially for incorrect outputs) during training. To address this, we introduce Dr. GRPO, an unbiased optimization method that improves token efficiency while maintaining reasoning performance. Leveraging these insights, we present a minimalist R1-Zero recipe that achieves 43.3\% accuracy on AIME 2024 with a 7B base model, establishing a new state-of-the-art. Our code is available at https://github.com/sail-sg/understand-r1-zero.},
pubstate = {prepublished},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning,paper: coding_age_ai},
file = {/home/drezil/Zotero/storage/YFH83QF5/Liu et al. - 2025 - Understanding R1-Zero-Like Training A Critical Perspective.pdf;/home/drezil/Zotero/storage/VPK84FQT/2503.html}
}
% == BibLaTeX quality report for Liu2025UnderstandingR1ZeroLikeTraining:
% ? Title looks like it was stored in title-case in Zotero
% ? unused Number ("arXiv:2503.20783")
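% -- Annotation (illustrative, not from the cited work): the paper's Dr. GRPO fix
% removes two normalizations from GRPO. A minimal Python sketch of the
% group-relative advantage step, assuming one list of scalar rewards per sampled
% group; function names are placeholders:
%
%   import numpy as np
%
%   def grpo_advantages(rewards, eps=1e-8):
%       # GRPO: subtract the group mean, then divide by the group std. Liu et al.
%       # identify this std term (plus the per-response length normalization in
%       # the loss) as the source of the length bias.
%       r = np.asarray(rewards, dtype=float)
%       return (r - r.mean()) / (r.std() + eps)
%
%   def dr_grpo_advantages(rewards):
%       # Dr. GRPO: keep only the mean baseline; token losses are summed without
%       # dividing by the response length.
%       r = np.asarray(rewards, dtype=float)
%       return r - r.mean()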
@online{Mukherjee2025ReinforcementLearningFinetunes,
title = {Reinforcement {{Learning Finetunes Small Subnetworks}} in {{Large Language Models}}},
author = {Mukherjee, Sagnik and Yuan, Lifan and Hakkani-Tur, Dilek and Peng, Hao},
date = {2025-05-16},
eprint = {2505.11711},
eprinttype = {arXiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2505.11711},
url = {http://arxiv.org/abs/2505.11711},
urldate = {2025-06-26},
abstract = {Reinforcement learning (RL) yields substantial improvements in large language models (LLMs) downstream task performance and alignment with human values. Surprisingly, such large gains result from updating only a small subnetwork comprising just 5 percent to 30 percent of the parameters, with the rest effectively unchanged. We refer to this phenomenon as parameter update sparsity induced by RL. It is observed across all 7 widely used RL algorithms (e.g., PPO, GRPO, DPO) and all 10 LLMs from different families in our experiments. This sparsity is intrinsic and occurs without any explicit sparsity promoting regularizations or architectural constraints. Finetuning the subnetwork alone recovers the test accuracy, and, remarkably, produces a model nearly identical to the one obtained via full finetuning. The subnetworks from different random seeds, training data, and even RL algorithms show substantially greater overlap than expected by chance. Our analysis suggests that this sparsity is not due to updating only a subset of layers, instead, nearly all parameter matrices receive similarly sparse updates. Moreover, the updates to almost all parameter matrices are nearly full-rank, suggesting RL updates a small subset of parameters that nevertheless span almost the full subspaces that the parameter matrices can represent. We conjecture that the this update sparsity can be primarily attributed to training on data that is near the policy distribution, techniques that encourage the policy to remain close to the pretrained model, such as the KL regularization and gradient clipping, have limited impact.},
pubstate = {prepublished},
keywords = {Computer Science - Machine Learning,paper: coding_age_ai},
file = {/home/drezil/Zotero/storage/HFEQA4GW/Mukherjee et al. - 2025 - Reinforcement Learning Finetunes Small Subnetworks in Large Language Models.pdf;/home/drezil/Zotero/storage/QTJPABH6/2505.html}
}
% == BibLaTeX quality report for Mukherjee2025ReinforcementLearningFinetunes:
% ? Title looks like it was stored in title-case in Zotero
% ? unused Number ("arXiv:2505.11711")
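% -- Annotation (illustrative, not from the cited work): the "parameter update
% sparsity" in the abstract can be measured by diffing the base checkpoint against
% the RL-finetuned one. A PyTorch sketch under assumed model names and an
% exact-equality tolerance, not the authors' exact procedure:
%
%   import torch
%
%   def fraction_updated(base_model, tuned_model, atol=0.0, rtol=0.0):
%       # Share of parameters whose values changed during finetuning; the paper
%       # reports roughly 5-30% for RL.
%       base = dict(base_model.named_parameters())
%       changed, total = 0, 0
%       for name, p_tuned in tuned_model.named_parameters():
%           same = torch.isclose(p_tuned, base[name], rtol=rtol, atol=atol)
%           changed += (~same).sum().item()
%           total += p_tuned.numel()
%       return changed / total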
@online{Shao2025SpuriousRewardsRethinking,
title = {Spurious {{Rewards}}: {{Rethinking Training Signals}} in {{RLVR}}},
shorttitle = {Spurious {{Rewards}}},
author = {Shao, Rulin and Li, Shuyue Stella and Xin, Rui and Geng, Scott and Wang, Yiping and Oh, Sewoong and Du, Simon Shaolei and Lambert, Nathan and Min, Sewon and Krishna, Ranjay and Tsvetkov, Yulia and Hajishirzi, Hannaneh and Koh, Pang Wei and Zettlemoyer, Luke},
date = {2025-06-12},
eprint = {2506.10947},
eprinttype = {arXiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2506.10947},
url = {http://arxiv.org/abs/2506.10947},
urldate = {2025-06-26},
abstract = {We show that reinforcement learning with verifiable rewards (RLVR) can elicit strong mathematical reasoning in certain models even with spurious rewards that have little, no, or even negative correlation with the correct answer. For example, RLVR improves MATH-500 performance for Qwen2.5-Math-7B in absolute points by 21.4\% (random reward), 13.8\% (format reward), 24.1\% (incorrect label), 26.0\% (1-shot RL), and 27.1\% (majority voting) -- nearly matching the 29.1\% gained with ground truth rewards. However, the spurious rewards that work for Qwen often fail to yield gains with other model families like Llama3 or OLMo2. In particular, we find code reasoning -- thinking in code without actual code execution -- to be a distinctive Qwen2.5-Math behavior that becomes significantly more frequent after RLVR, from 65\% to over 90\%, even with spurious rewards. Overall, we hypothesize that, given the lack of useful reward signal, RLVR must somehow be surfacing useful reasoning representations learned during pretraining, although the exact mechanism remains a topic for future work. We suggest that future RLVR research should possibly be validated on diverse models rather than a single de facto choice, as we show that it is easy to get significant performance gains on Qwen models even with completely spurious reward signals.},
pubstate = {prepublished},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning,paper: coding_age_ai},
file = {/home/drezil/Zotero/storage/B6YD7BAS/Shao et al. - 2025 - Spurious Rewards Rethinking Training Signals in RLVR.pdf;/home/drezil/Zotero/storage/34GN67VD/2506.html}
}
% == BibLaTeX quality report for Shao2025SpuriousRewardsRethinking:
% ? Title looks like it was stored in title-case in Zotero
% ? unused Number ("arXiv:2506.10947")
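% -- Annotation (illustrative, not from the cited work): the "spurious rewards" in
% the abstract are reward functions decoupled from answer correctness. A Python
% sketch of the kinds of signals named there (random, format, ground truth); the
% exact definitions in the paper may differ:
%
%   import random, re
%
%   def random_reward(response, gold):
%       # Coin-flip reward, independent of the model output.
%       return float(random.random() < 0.5)
%
%   def format_reward(response, gold):
%       # Reward any response containing a \boxed{...} final answer, correct or not.
%       return float(re.search(r"\\boxed\{", response) is not None)
%
%   def ground_truth_reward(response, gold):
%       # Reference signal: crude containment check against the gold answer.
%       return float(gold.strip() in response)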
@online{Yue2025DoesReinforcementLearning,
title = {Does {{Reinforcement Learning Really Incentivize Reasoning Capacity}} in {{LLMs Beyond}} the {{Base Model}}?},
author = {Yue, Yang and Chen, Zhiqi and Lu, Rui and Zhao, Andrew and Wang, Zhaokai and Yue, Yang and Song, Shiji and Huang, Gao},
date = {2025-05-16},
eprint = {2504.13837},
eprinttype = {arXiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2504.13837},
url = {http://arxiv.org/abs/2504.13837},
urldate = {2025-06-26},
abstract = {Reinforcement Learning with Verifiable Rewards (RLVR) has recently demonstrated notable success in enhancing the reasoning performance of large language models (LLMs), particularly on mathematics and programming tasks. Similar to how traditional RL helps agents explore and learn new strategies, RLVR is believed to enable LLMs to continuously self-improve, thus acquiring novel reasoning abilities beyond those of the corresponding base models. In this study we critically examine the current state of RLVR by systematically probing the reasoning capability boundaries of RLVR-trained LLMs across various model families, RL algorithms, and math, coding, and visual reasoning benchmarks, using pass@k at large k values as the evaluation metric. Surprisingly, we find that the current training setup does not elicit fundamentally new reasoning patterns. While RLVR-trained models outperform their base models at small k (e.g., k = 1), the base models achieve a higher pass@k score when k is large. Coverage and perplexity analyses show that the observed reasoning abilities originate from and are bounded by the base model. Treating the base model as an upper bound, our quantitative analysis shows that six popular RLVR algorithms perform similarly and remain far from optimal in leveraging the potential of the base model. By contrast, we find that distillation can introduce new reasoning patterns from the teacher and genuinely expand the model's reasoning capabilities. Overall, our findings suggest that current RLVR methods have not yet realized the potential of RL to elicit truly novel reasoning abilities in LLMs. This highlights the need for improved RL paradigms, such as continual scaling and multi-turn agent-environment interaction, to unlock this potential.},
pubstate = {prepublished},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Computer Vision and Pattern Recognition,paper: coding_age_ai},
file = {/home/drezil/Zotero/storage/5497LRFF/Yue et al. - 2025 - Does Reinforcement Learning Really Incentivize Reasoning Capacity in LLMs Beyond the Base Model.pdf;/home/drezil/Zotero/storage/V6UEYR9C/2504.html}
}
% == BibLaTeX quality report for Yue2025DoesReinforcementLearning:
% ? Title looks like it was stored in title-case in Zotero
% ? unused Number ("arXiv:2504.13837")
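% -- Annotation (illustrative, not from the cited work): the abstract's evaluation
% metric, pass@k at large k, is typically computed with the standard unbiased
% estimator: draw n samples, count c correct, pass@k = 1 - C(n-c, k) / C(n, k).
% A minimal Python sketch:
%
%   from math import comb
%
%   def pass_at_k(n: int, c: int, k: int) -> float:
%       # Probability that at least one of k samples, drawn without replacement
%       # from n generations of which c are correct, solves the task.
%       if n - c < k:
%           return 1.0
%       return 1.0 - comb(n - c, k) / comb(n, k)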