diff --git a/Writing/coding-age-ai.bib b/Writing/coding-age-ai.bib new file mode 100644 index 0000000..0e8ef41 --- /dev/null +++ b/Writing/coding-age-ai.bib @@ -0,0 +1,115 @@ +@video{bycloud2025LLMsRLRevelation, + entrysubtype = {video}, + title = {The {{LLM}}'s {{RL Revelation We Didn}}'t {{See Coming}}}, + editor = {{bycloud}}, + editortype = {director}, + date = {2025-06-24}, + url = {https://www.youtube.com/watch?v=z3awgfU4yno}, + urldate = {2025-06-26}, + keywords = {paper: coding_age_ai} +} +% == BibLateX quality report for bycloud2025LLMsRLRevelation: +% Unexpected field 'title' +% Unexpected field 'editor' +% Unexpected field 'editortype' +% ? Title looks like it was stored in title-case in Zotero +% ? unused Library catalog ("YouTube") +% ? unused Running time ("15:33") + +@online{DeepSeek-AI2025DeepSeekR1IncentivizingReasoning, + title = {{{DeepSeek-R1}}: {{Incentivizing Reasoning Capability}} in {{LLMs}} via {{Reinforcement Learning}}}, + shorttitle = {{{DeepSeek-R1}}}, + author = {DeepSeek-AI and Guo, Daya and Yang, Dejian and Zhang, Haowei and Song, Junxiao and Zhang, Ruoyu and Xu, Runxin and Zhu, Qihao and Ma, Shirong and Wang, Peiyi and Bi, Xiao and Zhang, Xiaokang and Yu, Xingkai and Wu, Yu and Wu, Z. F. and Gou, Zhibin and Shao, Zhihong and Li, Zhuoshu and Gao, Ziyi and Liu, Aixin and Xue, Bing and Wang, Bingxuan and Wu, Bochao and Feng, Bei and Lu, Chengda and Zhao, Chenggang and Deng, Chengqi and Zhang, Chenyu and Ruan, Chong and Dai, Damai and Chen, Deli and Ji, Dongjie and Li, Erhang and Lin, Fangyun and Dai, Fucong and Luo, Fuli and Hao, Guangbo and Chen, Guanting and Li, Guowei and Zhang, H. and Bao, Han and Xu, Hanwei and Wang, Haocheng and Ding, Honghui and Xin, Huajian and Gao, Huazuo and Qu, Hui and Li, Hui and Guo, Jianzhong and Li, Jiashi and Wang, Jiawei and Chen, Jingchang and Yuan, Jingyang and Qiu, Junjie and Li, Junlong and Cai, J. L. and Ni, Jiaqi and Liang, Jian and Chen, Jin and Dong, Kai and Hu, Kai and Gao, Kaige and Guan, Kang and Huang, Kexin and Yu, Kuai and Wang, Lean and Zhang, Lecong and Zhao, Liang and Wang, Litong and Zhang, Liyue and Xu, Lei and Xia, Leyi and Zhang, Mingchuan and Zhang, Minghua and Tang, Minghui and Li, Meng and Wang, Miaojun and Li, Mingming and Tian, Ning and Huang, Panpan and Zhang, Peng and Wang, Qiancheng and Chen, Qinyu and Du, Qiushi and Ge, Ruiqi and Zhang, Ruisong and Pan, Ruizhe and Wang, Runji and Chen, R. J. and Jin, R. L. and Chen, Ruyi and Lu, Shanghao and Zhou, Shangyan and Chen, Shanhuang and Ye, Shengfeng and Wang, Shiyu and Yu, Shuiping and Zhou, Shunfeng and Pan, Shuting and Li, S. S. and Zhou, Shuang and Wu, Shaoqing and Ye, Shengfeng and Yun, Tao and Pei, Tian and Sun, Tianyu and Wang, T. and Zeng, Wangding and Zhao, Wanjia and Liu, Wen and Liang, Wenfeng and Gao, Wenjun and Yu, Wenqin and Zhang, Wentao and Xiao, W. L. and An, Wei and Liu, Xiaodong and Wang, Xiaohan and Chen, Xiaokang and Nie, Xiaotao and Cheng, Xin and Liu, Xin and Xie, Xin and Liu, Xingchao and Yang, Xinyu and Li, Xinyuan and Su, Xuecheng and Lin, Xuheng and Li, X. Q. and Jin, Xiangyue and Shen, Xiaojin and Chen, Xiaosha and Sun, Xiaowen and Wang, Xiaoxiang and Song, Xinnan and Zhou, Xinyi and Wang, Xianzu and Shan, Xinxia and Li, Y. K. and Wang, Y. Q. and Wei, Y. X. 
and Zhang, Yang and Xu, Yanhong and Li, Yao and Zhao, Yao and Sun, Yaofeng and Wang, Yaohui and Yu, Yi and Zhang, Yichao and Shi, Yifan and Xiong, Yiliang and He, Ying and Piao, Yishi and Wang, Yisong and Tan, Yixuan and Ma, Yiyang and Liu, Yiyuan and Guo, Yongqiang and Ou, Yuan and Wang, Yuduan and Gong, Yue and Zou, Yuheng and He, Yujia and Xiong, Yunfan and Luo, Yuxiang and You, Yuxiang and Liu, Yuxuan and Zhou, Yuyang and Zhu, Y. X. and Xu, Yanhong and Huang, Yanping and Li, Yaohui and Zheng, Yi and Zhu, Yuchen and Ma, Yunxian and Tang, Ying and Zha, Yukun and Yan, Yuting and Ren, Z. Z. and Ren, Zehui and Sha, Zhangli and Fu, Zhe and Xu, Zhean and Xie, Zhenda and Zhang, Zhengyan and Hao, Zhewen and Ma, Zhicheng and Yan, Zhigang and Wu, Zhiyu and Gu, Zihui and Zhu, Zijia and Liu, Zijun and Li, Zilin and Xie, Ziwei and Song, Ziyang and Pan, Zizheng and Huang, Zhen and Xu, Zhipeng and Zhang, Zhongyu and Zhang, Zhen}, + date = {2025-01-22}, + eprint = {2501.12948}, + eprinttype = {arXiv}, + eprintclass = {cs}, + doi = {10.48550/arXiv.2501.12948}, + url = {http://arxiv.org/abs/2501.12948}, + urldate = {2025-06-26}, + abstract = {We introduce our first-generation reasoning models, DeepSeek-R1-Zero and DeepSeek-R1. DeepSeek-R1-Zero, a model trained via large-scale reinforcement learning (RL) without supervised fine-tuning (SFT) as a preliminary step, demonstrates remarkable reasoning capabilities. Through RL, DeepSeek-R1-Zero naturally emerges with numerous powerful and intriguing reasoning behaviors. However, it encounters challenges such as poor readability, and language mixing. To address these issues and further enhance reasoning performance, we introduce DeepSeek-R1, which incorporates multi-stage training and cold-start data before RL. DeepSeek-R1 achieves performance comparable to OpenAI-o1-1217 on reasoning tasks. To support the research community, we open-source DeepSeek-R1-Zero, DeepSeek-R1, and six dense models (1.5B, 7B, 8B, 14B, 32B, 70B) distilled from DeepSeek-R1 based on Qwen and Llama.}, + pubstate = {prepublished}, + keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning,paper: coding_age_ai}, + file = {/home/drezil/Zotero/storage/5Z5755Y9/DeepSeek-AI et al. - 2025 - DeepSeek-R1 Incentivizing Reasoning Capability in LLMs via Reinforcement Learning.pdf;/home/drezil/Zotero/storage/SDRBNKQ3/2501.html} +} +% == BibLateX quality report for DeepSeek-AI2025DeepSeekR1IncentivizingReasoning: +% ? Title looks like it was stored in title-case in Zotero +% ? unused Number ("arXiv:2501.12948") + +@online{Liu2025UnderstandingR1ZeroLikeTraining, + title = {Understanding {{R1-Zero-Like Training}}: {{A Critical Perspective}}}, + shorttitle = {Understanding {{R1-Zero-Like Training}}}, + author = {Liu, Zichen and Chen, Changyu and Li, Wenjun and Qi, Penghui and Pang, Tianyu and Du, Chao and Lee, Wee Sun and Lin, Min}, + date = {2025-03-26}, + eprint = {2503.20783}, + eprinttype = {arXiv}, + eprintclass = {cs}, + doi = {10.48550/arXiv.2503.20783}, + url = {http://arxiv.org/abs/2503.20783}, + urldate = {2025-06-26}, + abstract = {DeepSeek-R1-Zero has shown that reinforcement learning (RL) at scale can directly enhance the reasoning capabilities of LLMs without supervised fine-tuning. In this work, we critically examine R1-Zero-like training by analyzing its two core components: base models and RL. 
We investigate a wide range of base models, including DeepSeek-V3-Base, to understand how pretraining characteristics influence RL performance. Our analysis reveals that DeepSeek-V3-Base already exhibit ''Aha moment'', while Qwen2.5 base models demonstrate strong reasoning capabilities even without prompt templates, suggesting potential pretraining biases. Additionally, we identify an optimization bias in Group Relative Policy Optimization (GRPO), which artificially increases response length (especially for incorrect outputs) during training. To address this, we introduce Dr. GRPO, an unbiased optimization method that improves token efficiency while maintaining reasoning performance. Leveraging these insights, we present a minimalist R1-Zero recipe that achieves 43.3\% accuracy on AIME 2024 with a 7B base model, establishing a new state-of-the-art. Our code is available at https://github.com/sail-sg/understand-r1-zero.}, + pubstate = {prepublished}, + keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning,paper: coding_age_ai}, + file = {/home/drezil/Zotero/storage/YFH83QF5/Liu et al. - 2025 - Understanding R1-Zero-Like Training A Critical Perspective.pdf;/home/drezil/Zotero/storage/VPK84FQT/2503.html} +} +% == BibLateX quality report for Liu2025UnderstandingR1ZeroLikeTraining: +% ? Title looks like it was stored in title-case in Zotero +% ? unused Number ("arXiv:2503.20783") + +@online{Mukherjee2025ReinforcementLearningFinetunes, + title = {Reinforcement {{Learning Finetunes Small Subnetworks}} in {{Large Language Models}}}, + author = {Mukherjee, Sagnik and Yuan, Lifan and Hakkani-Tur, Dilek and Peng, Hao}, + date = {2025-05-16}, + eprint = {2505.11711}, + eprinttype = {arXiv}, + eprintclass = {cs}, + doi = {10.48550/arXiv.2505.11711}, + url = {http://arxiv.org/abs/2505.11711}, + urldate = {2025-06-26}, + abstract = {Reinforcement learning (RL) yields substantial improvements in large language models (LLMs) downstream task performance and alignment with human values. Surprisingly, such large gains result from updating only a small subnetwork comprising just 5 percent to 30 percent of the parameters, with the rest effectively unchanged. We refer to this phenomenon as parameter update sparsity induced by RL. It is observed across all 7 widely used RL algorithms (e.g., PPO, GRPO, DPO) and all 10 LLMs from different families in our experiments. This sparsity is intrinsic and occurs without any explicit sparsity promoting regularizations or architectural constraints. Finetuning the subnetwork alone recovers the test accuracy, and, remarkably, produces a model nearly identical to the one obtained via full finetuning. The subnetworks from different random seeds, training data, and even RL algorithms show substantially greater overlap than expected by chance. Our analysis suggests that this sparsity is not due to updating only a subset of layers, instead, nearly all parameter matrices receive similarly sparse updates. Moreover, the updates to almost all parameter matrices are nearly full-rank, suggesting RL updates a small subset of parameters that nevertheless span almost the full subspaces that the parameter matrices can represent. 
We conjecture that the this update sparsity can be primarily attributed to training on data that is near the policy distribution, techniques that encourage the policy to remain close to the pretrained model, such as the KL regularization and gradient clipping, have limited impact.}, + pubstate = {prepublished}, + keywords = {Computer Science - Machine Learning,paper: coding_age_ai}, + file = {/home/drezil/Zotero/storage/HFEQA4GW/Mukherjee et al. - 2025 - Reinforcement Learning Finetunes Small Subnetworks in Large Language Models.pdf;/home/drezil/Zotero/storage/QTJPABH6/2505.html} +} +% == BibLateX quality report for Mukherjee2025ReinforcementLearningFinetunes: +% ? Title looks like it was stored in title-case in Zotero +% ? unused Number ("arXiv:2505.11711") + +@online{Shao2025SpuriousRewardsRethinking, + title = {Spurious {{Rewards}}: {{Rethinking Training Signals}} in {{RLVR}}}, + shorttitle = {Spurious {{Rewards}}}, + author = {Shao, Rulin and Li, Shuyue Stella and Xin, Rui and Geng, Scott and Wang, Yiping and Oh, Sewoong and Du, Simon Shaolei and Lambert, Nathan and Min, Sewon and Krishna, Ranjay and Tsvetkov, Yulia and Hajishirzi, Hannaneh and Koh, Pang Wei and Zettlemoyer, Luke}, + date = {2025-06-12}, + eprint = {2506.10947}, + eprinttype = {arXiv}, + eprintclass = {cs}, + doi = {10.48550/arXiv.2506.10947}, + url = {http://arxiv.org/abs/2506.10947}, + urldate = {2025-06-26}, + abstract = {We show that reinforcement learning with verifiable rewards (RLVR) can elicit strong mathematical reasoning in certain models even with spurious rewards that have little, no, or even negative correlation with the correct answer. For example, RLVR improves MATH-500 performance for Qwen2.5-Math-7B in absolute points by 21.4\% (random reward), 13.8\% (format reward), 24.1\% (incorrect label), 26.0\% (1-shot RL), and 27.1\% (majority voting) -- nearly matching the 29.1\% gained with ground truth rewards. However, the spurious rewards that work for Qwen often fail to yield gains with other model families like Llama3 or OLMo2. In particular, we find code reasoning -- thinking in code without actual code execution -- to be a distinctive Qwen2.5-Math behavior that becomes significantly more frequent after RLVR, from 65\% to over 90\%, even with spurious rewards. Overall, we hypothesize that, given the lack of useful reward signal, RLVR must somehow be surfacing useful reasoning representations learned during pretraining, although the exact mechanism remains a topic for future work. We suggest that future RLVR research should possibly be validated on diverse models rather than a single de facto choice, as we show that it is easy to get significant performance gains on Qwen models even with completely spurious reward signals.}, + pubstate = {prepublished}, + keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning,paper: coding_age_ai}, + file = {/home/drezil/Zotero/storage/B6YD7BAS/Shao et al. - 2025 - Spurious Rewards Rethinking Training Signals in RLVR.pdf;/home/drezil/Zotero/storage/34GN67VD/2506.html} +} +% == BibLateX quality report for Shao2025SpuriousRewardsRethinking: +% ? Title looks like it was stored in title-case in Zotero +% ? 
unused Number ("arXiv:2506.10947") + +@online{Yue2025DoesReinforcementLearning, + title = {Does {{Reinforcement Learning Really Incentivize Reasoning Capacity}} in {{LLMs Beyond}} the {{Base Model}}?}, + author = {Yue, Yang and Chen, Zhiqi and Lu, Rui and Zhao, Andrew and Wang, Zhaokai and Yue, Yang and Song, Shiji and Huang, Gao}, + date = {2025-05-16}, + eprint = {2504.13837}, + eprinttype = {arXiv}, + eprintclass = {cs}, + doi = {10.48550/arXiv.2504.13837}, + url = {http://arxiv.org/abs/2504.13837}, + urldate = {2025-06-26}, + abstract = {Reinforcement Learning with Verifiable Rewards (RLVR) has recently demonstrated notable success in enhancing the reasoning performance of large language models (LLMs), particularly on mathematics and programming tasks. Similar to how traditional RL helps agents explore and learn new strategies, RLVR is believed to enable LLMs to continuously self-improve, thus acquiring novel reasoning abilities beyond those of the corresponding base models. In this study we critically examine the current state of RLVR by systematically probing the reasoning capability boundaries of RLVR-trained LLMs across various model families, RL algorithms, and math, coding, and visual reasoning benchmarks, using pass@k at large k values as the evaluation metric. Surprisingly, we find that the current training setup does not elicit fundamentally new reasoning patterns. While RLVR-trained models outperform their base models at small k (e.g., k = 1), the base models achieve a higher pass@k score when k is large. Coverage and perplexity analyses show that the observed reasoning abilities originate from and are bounded by the base model. Treating the base model as an upper bound, our quantitative analysis shows that six popular RLVR algorithms perform similarly and remain far from optimal in leveraging the potential of the base model. By contrast, we find that distillation can introduce new reasoning patterns from the teacher and genuinely expand the model's reasoning capabilities. Overall, our findings suggest that current RLVR methods have not yet realized the potential of RL to elicit truly novel reasoning abilities in LLMs. This highlights the need for improved RL paradigms, such as continual scaling and multi-turn agent-environment interaction, to unlock this potential.}, + pubstate = {prepublished}, + keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Computer Vision and Pattern Recognition,paper: coding_age_ai}, + file = {/home/drezil/Zotero/storage/5497LRFF/Yue et al. - 2025 - Does Reinforcement Learning Really Incentivize Reasoning Capacity in LLMs Beyond the Base Model.pdf;/home/drezil/Zotero/storage/V6UEYR9C/2504.html} +} +% == BibLateX quality report for Yue2025DoesReinforcementLearning: +% ? Title looks like it was stored in title-case in Zotero +% ? unused Number ("arXiv:2504.13837") diff --git a/Writing/coding-age-ai.md b/Writing/coding-age-ai.md new file mode 100644 index 0000000..02e60c9 --- /dev/null +++ b/Writing/coding-age-ai.md @@ -0,0 +1,1235 @@ +--- +tags: + - Writing +cssclasses: + - table-wide + - table-wrap +title: "Field Report: Coding in the Age of AI with Cursor" +description: | + A real Field Report out of the trenches of AI-assisted coding with its ups and downs. +abstract: | + This report presents practical methodologies and best practices for developing software using Cursor, an AI-assisted IDE.
The paper details a structured workflow that emphasizes specification-driven development, comprehensive documentation practices, and systematic task management to maintain quality control when working with language models. Through detailed examples and rule configurations, it demonstrates how to leverage AI capabilities while mitigating common pitfalls such as redundant code generation and context limitations. The methodology presented includes comprehensive PRD (Product Requirement Document) creation, specification adherence checks, and task breakdown systems that ensure alignment between human intent and AI execution. This guide serves as a practical reference for developers seeking to effectively integrate AI tools into their software development workflow while maintaining project integrity and scalability. + +lang: de +date: 2025-06-26 +authors: + - name: Nicole Dresselhaus + affiliation: + - name: Humboldt-Universität zu Berlin + url: https://hu-berlin.de + email: nicole.dresselhaus@hu-berlin.de + correspondence: true + orcid: 0009-0008-8850-3679 + roles: + - Conceptualization + - Investigation + - Validation + - "Writing – original draft" + - "Writing – review & editing" +citation: true +google-scholar: true +fileClass: authored +bibliography: + - coding-age-ai.bib +citation-style: springer-humanities-brackets +image: ../thumbs/writing_coding-age-ai.png +format: + html: + other-links: + - text: Source of this article + icon: filetype-md + href: https://gitea.dresselhaus.cloud/Drezil/quarto/src/branch/main/Writing/coding-age-ai.md + - text: Documentation for gitlab_overviewer + icon: filetype-doc + href: https://pages.cms.hu-berlin.de/methodenlabor/p_gitlab-overviewer/ + citation-location: document + reference-location: margin # footnotes in the margin (if desired) + mermaid: + theme: default +--- + +## Introduction + +In the fast-evolving field of AI there is a clear lack of reports on "what +really works". Some techniques hailed as a revolution (like the DeepSeek +Aha-Moment[@DeepSeek-AI2025DeepSeekR1IncentivizingReasoning]) with unlimited +potential were soon realized to "just" optimize niche problems that can be +benchmarked[@Shao2025SpuriousRewardsRethinking]^[Like all decent humans I ain't +got time to read up on everything - so a big shoutout to +[@bycloud2025LLMsRLRevelation] for doing thorough reviews on ML-topics and +linking the respective papers!]. + +I personally think it is an exercise in futility to get a _current_ theoretical +overview for forming a decently grounded opinion on the state of things. Even +before one is done analyzing the literature, cross-referencing and collecting +evidence and then finally formulating methods and implementing them, the next +revolution comes around that could put everything on its head again. In the +aforementioned example the community went from "Reasoning is the solution" in +January[@DeepSeek-AI2025DeepSeekR1IncentivizingReasoning] via first critical +views in March[@Liu2025UnderstandingR1ZeroLikeTraining] to doubts on that claim's +validity of generating concepts previously not present in the base model in +May[@Mukherjee2025ReinforcementLearningFinetunes] to the complete ad absurdum in +June[@Shao2025SpuriousRewardsRethinking]^[i.e. the "base model" nearly all +papers tested their findings on (the Qwen series) also gets better with +RLVR-optimization if rewards are random instead of verified].
+ +Therefore see this "Field Guide" for what it is: A current state of things that +work for at least one individual in exactly this ecosystem at this point in time. + +## How to program with Cursor + +In essence [Cursor](https://cursor.com) is "just" a fork of +[Microsoft's VSCode](https://code.visualstudio.com/) with some added +functionality: Automatically injecting files into LLM prompts, offering +tool-aware LLMs to use [MCP](https://modelcontextprotocol.io/introduction)s, +read the filesystem, execute arbitrary commands in the shell (either +automatically or after permission), get feedback from the editor (i.e. +installed linters, language-servers etc.) and thus have the same (or even +better) information/tools available as the programmer in front of the screen. + +### Capabilities / General procedure + +The main issue is now: theoretically agentic IDEs can get all information - +practically they are limited directly by token-window sizes and the costs of these +queries, and indirectly by outsourced costs like environmental impact, +data security, etc. The suppliers of such services can claim privacy as much as +they want - it can't be proven, and (especially under US law) it is not even +possible to resist lawful orders (including the gag orders to not talk about +them). + +In practice one feels the direct pain points more severely. Some regular examples +include generating redundant code, because the current context was not aware of +utility-modules and functions it could use - leading to huge technical debt in +no time. + +Therefore my preferred workflow is to "think bigger". Imagine being a product +owner of a huge, sluggish company. The left hand never knows what the right hand +does or has done (i.e. the LLM forgetting things already exist in the codebase), +so everything has to be rigorously defined, specified and enforced. Some people +reported good results with Test-Driven-Development (TDD) - but in my experience +these things only prevent regressions and do not proactively enforce the desired +agent behaviour. + +### Lessons from Project Management + +This may be a _duh!_ moment for people who have been in software engineering longer, but most +of the time is spent getting the specifications of what you want to build right. +Asking questions. Interviewing stakeholders. Gathering user experience reports. +Most of it is not actually writing code - but deciding on what to write and how. + +For this I created some rules in my workflow that interleave with their +responsibilities and outcomes. Especially in the planning phase the LLM is +encouraged to ask questions, find edge-cases or even look at other +implementations. One could also think of letting the agent do a websearch, read +reports and forums on how well which framework works and then argue why +this framework is good - and also why it is not good. The final decision on all +pros/cons is made by the actual human. + +The main theme always follows a similar pattern: + +- A **need** is found. + This could be a bug, a feature, some changes to existing behaviour etc. +- An investigation is launched, yielding a **Product Requirement Document + (PRD)**. + This dives into the codebase to assess the current state of things. Maybe some + bugs are obvious and easily fixed. + This formalizes that the LLM understood what _should_ be done and especially + what is _out of scope_. +- Pin the desired behaviour in a **Specification**. + Either this means changing currently established specifications (i.e. + bug/change) or writing completely new ones (i.e. feature).
- Investigate **Spec-Compliance**. + Again the agent looks at the codebase to identify _where_ things should change + and _how_. Also recommendations are made on how it could achieve the goal. +- Generate **Tasks**. + From the compliance-report of spec-deviations (either from a bug or from a + new/changed spec) a plan to fix everything can finally be derived (think: + Sprint-Planning). +- **NOTE: Up to here the agent never touched the code.** +- Finally **Implement the change**. + This is most often the most trivial step. Everything is known and formulated + for "simple" agents to just follow. It names the files needed, specs to follow, + guidelines on how to do things in the repository etc. + +## Implementation + +I only briefly go over the most important aspects of the implementation in rules +and their results. As this is also evolving with experience, there will be +updates to those at the published location. + +### The Rules + +Cursor rules are written in markdown with a small yaml-frontmatter. +`description` is a string providing the agent with a description of the rule so +it can be called like a tool (and basically injects the rule-content into the +context). `globs` automatically injects the rule when the glob matches an +attached filename. `alwaysApply` injects the rule into all context windows. + +#### Always apply rule: Basic info and behaviour + +This tells the agent the project-structure and fixes some common pitfalls (i.e. +the agent assuming `bash`, getting sidetracked, etc.). + +::: {.column-page-right} + +```markdown +--- +description: +globs: +alwaysApply: true +--- + +# IMPORTANT INFORMATION! DO NOT FORGET! + +## Conventions + +- Run tests with `poetry run pytest`. +- **NO BASH IDIOMS IN COMMANDS**. Do not assume bash-usage. Use temporary + scripts instead of bash-idioms like `<<` for text-input to a process. +- Ask to update corresponding spec when you detect or implement a behaviour + change. +- Mention every time things are not implemented according to spec and offer to + make them compliant. + +## Behavior of the Agent + +- If you have a task, ONLY do the task. Do not get distracted by failing tests, + missing data etc. not related to your task. +- Spec compliance is key. Check corresponding specs before you fix behaviour or + ask the user if you should proceed. + +## Where to look for information? + +- Status & changes: `git status`, test failures in `tests/` +- Active feature work: `/tasks//TASKS.md` +- Feature requirements: `/tasks//PRD.md` +- Feature specifications: `/specs/` +- Source code: `/src/` +- Tests & fixtures: `/tests/` +- CLI entry point: `/src//cli/__main__.py` +``` + +::: + +#### Get the PRD + +::: {.column-page-right} + +```markdown +--- +description: +globs: +alwaysApply: false +--- + +## Product Requirements Document (PRD) + +### Purpose + +Draft a concise, implementation‑ready Product Requirements Document (PRD) from a +one‑sentence feature description plus any additional Q&A with the stakeholder. + +### Output + +• Create /tasks//PRD.md • Markdown only – no prose, no code‑fences. • +File structure: # ## 1. Problem / Motivation ## 2. Goals ## 3. +Non‑Goals ## 4. Target Users & Personas ## 5. User Stories (Gherkin +“Given/When/Then”) ## 6. Acceptance Criteria ## 7. Technical Notes / +Dependencies ## 8. Open Questions + +### Process + +1. Stakeholder provides a single‑sentence feature idea and invokes this rule. +2. Look at specifications in `specs/` and inspect the code if needed to get an + idea of what the Stakeholder expects from this feature. +3.
Ask up to five clarifying questions (Q1 … Q5). If anything is still vague + after five, look into the project with the new information provided. You may + ask for further clarification up to 3 times following this schema, else flag + it in _Open Questions_. +4. After questions are answered reply exactly: Ready to generate the PRD. +5. On a user message that contains only the word "go" (case‑insensitive): • + Generate /tasks//PRD.md following _Output_ spec. • Reply: + /PRD.md created – review it. +6. STOP. Do **not** generate tasks or code in this rule. + +### Writing guidelines + +• Keep each bullet ≤120 characters. • Use action verbs and measurable language. +• Leave TBDs only in _Open Questions_. • No business fluff – pretend the reader +is a junior developer. + +### Safety rails + +• Assume all work happens in a non‑production environment, unless otherwise +stated or requested by you. • Do not include sensitive data or credentials in +the PRD. • Check the generated Document with `markdownlint` (if available), +apply auto-fixes and fix the remaining issues manually. +``` + +::: + +A call to this rule usually looks like +`@generate-prd We noticed, that …. Therefore investigate the codebase to come up with a PRD addressing these issues.`. + +#### Specifications + +::: {.column-page-right} + +````markdown +--- +description: Specification Writing Guidelines +globs: +alwaysApply: false +--- + +# Specification Writing Guidelines + +## Overview + +This rule provides guidelines for writing and maintaining specifications in +[specs/](mdc:specs) to ensure consistency, clarity, and prevent implementation +discrepancies. + +## Specification Structure + +### Required Sections + +Every specification should include: + +1. **Title and Purpose** + + ```markdown + # Specification: [Component Name] + + Brief description of what this specification covers and its purpose. + ``` + +2. **Scope and Boundaries** + + - What is included/excluded + - Dependencies on other specifications + - Relationship to other components + +3. **Detailed Requirements** + + - Structured by logical sections + - Clear, unambiguous language + - Examples where helpful + +4. **Error Handling** + + - How errors should be handled + - Fallback behaviors + - Edge cases + +5. 
**Testing Requirements** + - Expected test coverage + - Snapshot requirements + - Approval test criteria + +## Writing Standards + +### Clarity and Precision + +- **Use specific language**: Avoid vague terms like "should" or "might" +- **Provide examples**: Include concrete examples for complex requirements +- **Define terms**: Clearly define any technical terms or concepts +- **Use consistent formatting**: Follow established patterns from existing specs + +### Structure and Organization + +- **Logical flow**: Organize sections in logical order +- **Consistent headings**: Use consistent heading levels and naming +- **Cross-references**: Link to related specifications using + `[spec_name](mdc:specs/spec_name.md)` +- **Code blocks**: Use appropriate language tags for code examples + +### Completeness + +- **Cover all cases**: Address normal, error, and edge cases +- **Be exhaustive**: Don't assume implementation details +- **Consider interactions**: How this spec relates to others +- **Future-proof**: Consider potential changes and extensions + +## Specification Maintenance + +### Version Control + +- **Update specs first**: When changing behavior, update spec before + implementation +- **Document changes**: Use clear commit messages explaining spec changes +- **Review process**: Have specs reviewed before implementation + +### Consistency Checks + +- **Cross-reference validation**: Ensure all links to other specs are valid +- **Terminology consistency**: Use consistent terms across all specs +- **Format consistency**: Follow established formatting patterns + +### Testing Integration + +- **Spec-driven tests**: Write tests based on specification requirements +- **Snapshot validation**: Ensure snapshots match specification exactly +- **Approval tests**: Use approval tests to catch spec violations + +## Quality Checklist + +### Before Finalizing Specification + +- [ ] All requirements clearly stated +- [ ] Examples provided for complex requirements +- [ ] Error cases covered +- [ ] Cross-references to other specs included +- [ ] Out of scope items clearly defined +- [ ] Testing requirements specified +- [ ] Consistent formatting throughout +- [ ] Check the generated Document with `markdownlint` (if available), apply + auto-fixes and fix the remaining issues manually. + +### Review Criteria + +- [ ] Is the specification unambiguous? +- [ ] Are all edge cases covered? +- [ ] Does it integrate well with other specs? +- [ ] Is it testable? +- [ ] Is it maintainable? 
+ +## Common Pitfalls to Avoid + +### Ambiguity + +- **Vague language**: "The system should handle errors gracefully" +- **Missing details**: Not specifying exact error handling behavior +- **Unclear relationships**: Not explaining how components interact + +### Inconsistency + +- **Different terms**: Using different terms for the same concept +- **Inconsistent formatting**: Not following established patterns +- **Conflicting requirements**: Requirements that contradict other specs + +### Incompleteness + +- **Missing edge cases**: Not considering unusual scenarios +- **Incomplete examples**: Examples that don't cover all cases +- **Missing error handling**: Not specifying what happens when things go wrong + +## Related Rules + +- [spec-compliance-investigation.mdc](mdc:.cursor/rules/spec-compliance-investigation.mdc) + How to investigate spec-implementation discrepancies +- [base_overview.mdc](mdc:.cursor/rules/base_overview.mdc) Project structure and + conventions +```` + +::: + +As is obvious, this is a very intricate rule with many criteria. For this you +really **need** a reasoning and deep-thinking model that can also reason for +extended times (many minutes are normal!) and call tools every now and then to +get even more information. Models like `o3`, `deepseek-r1` and the `opus`-series +of `claude` really shine here. + +#### Spec Compliance + +::: {.column-page-right} + +````markdown +--- +description: Spec Compliance Investigation Guide +globs: +alwaysApply: false +--- + +# Spec Compliance Investigation Guide + +## Overview + +This rule provides a systematic approach for investigating discrepancies between +specifications and implementations, following the pattern established. Do not +change any code during this phase. + +## Investigation Process + +### 1. Initial Analysis + +- **Locate specification**: Find the relevant spec file in [specs/](mdc:specs) +- **Identify implementation**: Find corresponding source code in [src/](mdc:src) +- **Check tests**: Review test files in [tests/](mdc:tests) for expected + behavior +- **Run tests**: Execute `poetry run pytest` to identify current failures + +### 2. Systematic Comparison + +For each specification section: + +1. **Extract requirements** from spec file +2. **Examine implementation** in source code +3. **Compare outputs** with test snapshots +4. **Document discrepancies** with specific examples + +### 3. Documentation Structure + +Create analysis document in [tmp/spec\_[component]\_discrepancies.md](mdc:tmp/) +with: + +```markdown +# [Component] Specification vs Implementation Discrepancies + +## Executive Summary + +Brief overview of findings and impact + +## Key Discrepancies Found + +### 1.
[Category] - [Specific Issue] + +**Specification:** + +- Requirement details + +**Implementation:** + +- Current behavior +- ✅ Correct aspects +- ❌ Incorrect aspects + +## Test Results + +- Current test failures +- Output differences + +## Impact Assessment + +### High Impact Issues: + +- Critical functionality problems +- User experience issues + +### Medium Impact Issues: + +- Consistency problems +- Formatting issues + +### Low Impact Issues: + +- Minor differences +- Style variations + +## Recommendations + +### Option 1: Update Spec to Follow Code + +**What to change:** + +- Specific spec modifications + +**Pros:** + +- Benefits of this approach + +**Cons:** + +- Drawbacks of this approach + +### Option 2: Update Code to Follow Spec + +**What to change:** + +- Specific code modifications + +**Pros:** + +- Benefits of this approach + +**Cons:** + +- Drawbacks of this approach + +### Option 3: Recommended Hybrid Approach + +**Recommended Solution:** + +- Phased implementation plan + +**Rationale:** + +- Why this approach is best + +**Implementation Priority:** + +- Immediate, short-term, medium-term tasks +``` + +## Quality Checklist + +### Before Finalizing Investigation + +- [ ] All specification sections reviewed +- [ ] Implementation code thoroughly examined +- [ ] Tests run and failures documented +- [ ] Impact assessment completed +- [ ] All three solution options evaluated +- [ ] Recommendation justified with rationale +- [ ] Implementation plan prioritized + +### Documentation Quality + +- [ ] Specific examples provided for each discrepancy +- [ ] Code snippets included where relevant +- [ ] Pros/cons clearly articulated +- [ ] Implementation steps detailed +- [ ] Priority levels assigned + +## Related Rules + +- [Base Project Overview](mdc:.cursor/rules/base-project-overview.mdc) Project + structure and conventions +- [Spec Guidelines](mdc:.cursor/rules/spec-guidelines.mdc) How to write + specifications +```` + +::: + +This compliance-report also needs a deep-thinking model, like the specification +beforehand. + +#### Tasks + +::: {.column-page-right} + +```markdown +--- +description: +globs: +alwaysApply: false +--- + +## Task List creation + +### Purpose + +Translate a Product Requirements Document (PRD) into an executable Markdown task +list that a junior developer (human or AI) can follow without extra context. + +### Output + +• Create /tasks//TASKS.md (overwrite if it exists). • Markdown only, no +prose around it. • Epics = H2 headings (`## 1. `). • Tasks = unchecked +check‑boxes (`- [ ] 1.1 `). • Sub‑tasks = indent one space under their +parent (` - [ ] 1.1.1 `). • Create a +/tasks//Task**\_.md (i.e. `Task_3_2_4.md` for Epic +3, Task 2, Subtask 4) + +### Process + +1. Read the tagged PRD. +2. **Investigate** the current state of the repository to collect answers to + your first questions. All specs for fixed behaviours and outputs are located + in `specs/`. **Consult those** as a source first before trying to + reverse-engineer from the code. + If specs are in need of change then this is also a task to be generated. +3. If critical info is missing and cannot be answered by looking at the code, + ask max five clarifying questions (Q1 … Q5) and stop until answered. +4. After questions are answered think about the answers and: Either: look at the + code again, then goto 3., and ask for further clarification Or: Reply + exactly: Ready to generate the subtasks – respond **go** to proceed. +5. On a user message that contains only the word "go" (case‑insensitive): a.
+ Generate /tasks//TASKS.md following _Output_ spec. b. Reply with: + TASKS.md created – review them. +6. After TASKS.md was reviewed, create `Task___.md` for each task and + subtask containing implementation hints like relevant specs (link them!), + primary files to edit/review for this task, tests needing change, etc. +7. Stop. Do **not** begin executing tasks in this rule. + +### Writing guidelines + +• Each item ≤120 characters, start with an action verb. • Hints are allowed +below each item as HTML-Comment and do not count against the 120 characters. • +Group related work into logical epics with ≤7 direct child items. • Prefer +concrete file paths, commands, specs or APIs when available. • Skip +implementation details obvious from the codebase in the overview. • If a task +only concerns up to 5 files, name them in the detailed file. Otherwise give +hints on how to search for them (i.e. "everything under src/models/"). + +### Safety rails + +• Never touch production data. • Assume all work happens in a feature branch, +never commit directly to main. • Check the generated Document with +`markdownlint` (if available), apply auto-fixes and fix the remaining issues +manually. +``` + +::: + +This also works better with one of those deep-thinking models. + +#### Other Rules + +I have some other rules with guidelines on how to write proper tests, one for +"just follow the tasks in TASKS.md one by one until done with commit after each +task", etc. Those are omitted for brevity. Also they are so simple that +non-reasoning models can follow them. Cheap `claude3-sonnet` or small, +specialised coding-LLMs are enough to get the job done with this preparation. + +## Example: Rules in Action + +The codebase we look at here is a project called `gitlab_overviewer`. It takes +gitlab API keys and generates nice overviews for tracking metadata in different +projects across different groups. With a nice export to markdown (for rendering +in gitlab itself) and quarto (for exporting to e.g. confluence) with multiple +pages etc. + +The current issue is that after a complete rewrite we are happy with the +current output, but there are some minor problems still remaining. Some of them +may also be due to specs being incorrect, because requirements or ideas changed +over time. + +### Analyzing the situation + +For the analysis `o3` was used and it was sufficient on the first try. Here is (part +of) the conversation: + +> [!tip] **User** +> +> `@spec-compliance-investigation.mdc` do a spec-compliance investigation on +> `@spec_renderer_markdown.md` . The relevant modules in the `@/src` **should** +> link to the spec (omissions are possible and also out-of-spec). + +![The initial analysis with most thinking steps opened. It is interesting to see the agent collect the needed information on the first try and generate a lot of 'thought'.](images/initial_analysis.png) + +::: {.column-page-right} + +> [!note] **Cursor** +> +> tmp/spec_renderer_markdown_discrepancies.md: +> +> > # Markdown Renderer Specification vs Implementation Discrepancies +> > +> > ## Executive Summary +> > +> > The current `render_grouped_markdown` implementation provides most of the +> > layout and structural requirements defined in +> > `specs/spec_renderer_markdown.md`. However, several key aspects diverge from +> > the specification, leading to failing approval tests and user-visible +> > issues. The most critical problems concern ordering guarantees and the +> > content chosen for the _Description_ callout.
+> > +> > ## Key Discrepancies Found +> > +> > ### 1. Ordering – Projects inside detailed sections +> > +> > **Specification:** +> > +> > - Projects in the detailed part _"appear in the same order they had inside +> > their group's summary table"_ (Spec §4, bullet 3). +> > +> > **Implementation:** +> > +> > ```python +> > # ... excerpt ... +> > for od in sorted(group_projects, key=lambda p: p.project.name.lower()): +> > # renders project sections alphabetically +> > ``` +> > +> > - Projects are sorted **alphabetically**, not in the original summary-table +> > order. +> > - ✅ Groups themselves are ordered by descending project count (matches +> > spec). +> > - ❌ Project order within groups breaks the “mirror the incoming, +> > already-sorted data” rule. +> > +> > ### 2. _Description_ callout picks raw front-matter if no `description` key exists +> > +> > **Specification:** +> > +> > - If front-matter contains a `description` key → use its value (callout-type +> > `tip`). +> > - Else if `Readme.content` is not empty → use that content (callout-type +> > `warning`). +> > - `Readme.content` is defined as the **first real paragraph without +> > front-matter** (see Model spec). +> > +> > **Implementation:** +> > +> > - Correctly checks for `.raw_frontmatter["description"]` first ⬆️. +> > - Then falls back to `od.readme.content` unconditionally. In practice many +> > fixture READMEs still include `--- …` YAML because the pre-processing step +> > did **not** strip it, so the callout now contains the entire front-matter +> > block – cf. failing snapshot diff lines starting with `> --- "author":`. +> > - ✅ Callout types (`tip`/`warning`/`danger`) are chosen per spec. +> > - ❌ Content for the warning variant violates the “no raw front-matter” +> > expectation and visual quality. +> > +> > 8\< ……… 8\< +> > +> > ## Test Results +> > +> > ``` +> > poetry run pytest -q → 5 failures / 119 passed +> > • tests/rendering/test_markdown_renderer_approval.py (output mismatch) +> > • tests/rendering/test_quarto_renderer_approval.py (cascade – shares ordering logic) +> > • tests/models/test_model_invariants.py (3 failures unrelated to renderer) +> > ``` +> > +> > The Markdown approval diff highlights exactly the two major discrepancies +> > above (alphabetical ordering & raw front-matter in callout). +> > +> > ## Impact Assessment +> > +> > ### High Impact Issues +> > +> > 1. **Incorrect project ordering** – breaks reproducibility guarantees and +> > user expectations; also propagates to Quarto renderer. +> > 2. **Raw YAML in description callout** – severely harms readability of +> > generated documentation. +> > +> > ### Medium Impact Issues +> > +> > - None identified besides cosmetic phrasing. +> > +> > ### Low Impact Issues +> > +> > - Minor stylistic deviations (e.g., spacing) appear compliant within +> > tolerance. +> > +> > ## Recommendations +> > +> > ### Option 1: Update Spec to Follow Code +> > +> > **What to change:** +> > +> > - Relax ordering requirement to alphabetical order. +> > - Allow raw front-matter YAML in description callout. +> > +> > **Pros:** No code changes; quick test green. +> > +> > **Cons:** Violates original design intent; exposes low-level data to +> > end-users; poor UX. +> > +> > ### Option 2: Update Code to Follow Spec (preferred) +> > +> > **What to change:** +> > +> > 1. Remove the `sorted(...)` call to preserve input ordering. +> > 2. Ensure `Readme.content` truly excludes YAML front-matter (use +> > `parse_readme` during data ingestion or strip before rendering). 
+> > +> > **Pros:** Restores spec fidelity; fixes approval tests; better documentation +> > quality. +> > +> > **Cons:** Requires code edit & possibly data-loading adjustments. +> > +> > ### Option 3: Hybrid Approach +> > +> > - Fix ordering in renderer. +> > - Keep current fallback but truncate front-matter to first paragraph until +> > improved extraction available. +> > +> > **Recommended Solution:** **Option 2** – small, well-scoped patches bring +> > implementation back in line with spec and tests. +> > +> > **Implementation Priority:** +> > +> > - Immediate: ordering fix & front-matter stripping. +> > - Short-term: add regression tests for ordering. +> > - Medium-term: audit other renderers (Quarto) for same issues. +> +> I have completed the investigation phase and produced a detailed report at +> +> ```text +> tmp/spec_renderer_markdown_discrepancies.md +> ``` +> +> Key take-aways: +> +> • Two core deviations from the Markdown-renderer spec were found +> +> 1. Project ordering in the detailed section is alphabetical instead of +> mirroring the summary order. +> 2. When a README lacks a `description:` key, the _Description_ call-out falls +> back to raw front-matter, so YAML blocks are rendered verbatim. +> +> • These issues are the root cause of the failing approval tests. +> +> • A concise impact assessment and three solution options are documented; the +> recommended path is to fix the code to match the specification. +> +> Let me know if you’d like to proceed with the recommended fixes or explore the +> alternate options. + +::: + +### Correcting the analysis + +By reading the analysis I soon noticed that things were missing from the +spec. So I added a bit of text: + +> [!warning] Commenting the result +> +> ### 1. Ordering – Projects inside detailed sections +> +> **Specification:** > \[…snip…\] +> +> - ❌ Project order within groups breaks the "mirror the incoming, +> already-sorted data" rule. +> +> **Comment:** +> +> - The implemented behaviour is using alphabetical order in the **detailed** +> section and in the table according to the sort defined in the +> `@spec_table_config` . The spec needs updating. +> +> ### 2. _Description_ callout picks raw front-matter if no `description` key exists +> +> **Specification:** > \[…snip…\] +> +> - ❌ Content for the warning variant violates the "no raw front-matter" +> expectation and visual quality. +> +> **Comment:** +> +> - This is a violation of spec. But additionally it should only render the +> first 10 lines of `Readme.content` and then add a `…continues…` if there are +> more lines. + +Armed with those corrections I told the LLM to generate better recommendations +that align with my goals. The LLM then deleted the whole recommendations-section +and wrote exactly what I wanted. Again `o3` was used for the slow and deep +thinking. + +> [!tip] **User** +> +> I added comments to the part that are not up to spec. Update the +> recommendations in `@spec_renderer_markdown_discrepancies.md` according to +> those comments. + +![Thoughts of the agent after I updated the comments. It always feels good to read that I was understood^[yaya .. I know. Anthropomorphizing an AI.] by the machine.](images/thoughts_after_commenting.png) + +::: {.column-page-right} + +> [!note] **Cursor** +> +> tmp/spec_renderer_markdown_discrepancies.md: +> +> > // ... existing code ... +> > +> > ## Recommendations +> > +> > ### Option A: Align Specification with Current Ordering (preferred) +> > +> > **What to change:** +> > +> > 1.
Amend `spec_renderer_markdown.md` §4 to state that _projects in the +> > detailed section appear alphabetically by project name_. This legitimises +> > the current implementation and removes the ordering-related approval +> > failure. +> > 2. Keep the existing implementation for project ordering unchanged. +> > 3. **Change the renderer implementation for the _Description_ call-out**: +> > - Strip YAML front-matter from `Readme.content` (use `parse_readme`). +> > - Render **at most the first 10 non-empty lines** of the resulting text. +> > - If more lines remain, append a marker `…continues…` as the final line +> > inside the call-out. +> > 4. Add corresponding wording to the spec clarifying the 10-line truncation +> > rule. +> > +> > **Pros:** +> > +> > - Least intrusive to working behaviour users already rely on. +> > - Fixes readability issue without reordering content. +> > - Smaller diff in test fixtures (only call-out content). +> > +> > **Cons:** +> > +> > - Requires a spec update (but that is a documentation-only task). +> > +> > ### Option B: Keep Original Spec Ordering, Change Code +> > +> > **What to change:** +> > +> > 1. Remove `sorted()` so project order mirrors summary-table order. +> > 2. Apply the same _Description_ call-out improvements as in Option A. +> > +> > **Pros:** +> > +> > - Upholds original design intent of summary/detailed symmetry. +> > +> > **Cons:** +> > +> > - Generates a larger diff for existing artefacts; may surprise current +> > users. +> > +> > ### Recommended Solution +> > +> > Proceed with **Option A**: +> > +> > - Update the specification paragraph on ordering. +> > - Implement the 10-line truncation + `…continues…` logic in the renderer. +> > +> > ### Implementation Priority +> > +> > 1. Immediate: implement call-out truncation logic & adjust approval +> > snapshots. +> > 2. Immediate: update Markdown renderer spec for alphabetical ordering and +> > call-out truncation. +> > 3. Short-term: add regression tests for the new truncation rule. +> > 4. Medium-term: propagate the truncation rule to the Quarto renderer to keep +> > output parity. // ... existing code ... +> +> The “Recommendations” section has been rewritten to reflect your comments: +> +> 1. Ordering: keep the current alphabetical behaviour and adjust the spec +> instead. +> 2. Description call-out: implement a 10-line preview with a “…continues…” +> marker after stripping front-matter. +> +> These points are now captured as **Option A (preferred)**, with a revised +> implementation roadmap. +> +> Let me know when you’d like me to start on the code/spec changes. + +::: + +### Creating Tasks + +Now that we have a clear plan without touching any code, let's flesh out the +details. + +> [!tip] **User** +> +> `@generate-tasks.mdc` generate Tasks for Option A of +> `@spec_renderer_markdown_discrepancies.md` under a feature called +> "markdown_rendering". + +::: {.column-page-right} + +> [!note] **Cursor** +> +> TASKS.md: +> +> > ## 1. Spec updates +> > +> > - [ ] 1.1 Amend alphabetical ordering rule in +> > `specs/spec_renderer_markdown.md` +> > - [ ] 1.2 Document 10-line truncation rule for _Description_ call-out +> > +> > ## 2.
Markdown renderer implementation +> > +> > - [ ] 2.1 Strip YAML front-matter before rendering description +> > - [ ] 2.1.1 Invoke `parse_readme` to obtain cleaned content +> > - [ ] 2.1.2 Add unit test ensuring YAML is absent in call-out +> > - [ ] 2.2 Truncate description to first 10 lines with `…continues…` +> > - [ ] 2.2.1 Implement truncation logic in `render_grouped_markdown` +> > - [ ] 2.2.2 Update approval snapshot for Overview.md +> > - [ ] 2.3 Verify alphabetical ordering comment references spec +> > +> > ## 3. Quarto renderer alignment +> > +> > - [ ] 3.1 Port truncation & YAML-stripping logic to +> > `src/gitlab_overviewer/rendering/quarto.py` +> > - [ ] 3.2 Refresh Quarto renderer snapshot & tests +> > +> > ## 4. Tests & snapshots +> > +> > - [ ] 4.1 Update Markdown renderer approval snapshot after changes +> > - [ ] 4.2 Update Quarto renderer snapshot after changes +> > - [ ] 4.3 Add regression test for 10-line truncation rule +> > +> > ## 5. Documentation & housekeeping +> > +> > - [ ] 5.1 Run `markdownlint` and fix issues in updated docs/specs +> > - [ ] 5.2 Add entry to `CHANGELOG.md` under _Unreleased_ +> +> Task_1_1.md: +> +> > ### Task 1.1 – Amend alphabetical ordering rule in spec_renderer_markdown.md +> > +> > **Relevant spec:** `specs/spec_renderer_markdown.md` (§4 Ordering +> > Guarantees) +> > +> > **Goal:** Replace the bullet _"projects appear in the same order they had +> > inside their group's summary table"_ with _"projects appear alphabetically +> > by their name"_. +> > +> > **Primary files:** +> > +> > - `specs/spec_renderer_markdown.md` +> > +> > **Hints:** +> > +> > - Search for the existing ordering clause (§4 bullet 3) and update +> > accordingly. +> > - Run `markdownlint` after editing: +> > `npx markdownlint-cli2 specs/spec_renderer_markdown.md --fix`. +> +> Task_1_2.md: +> +> > ### Task 1.2 – Document 10-line truncation rule for Description call-out +> > +> > **Relevant spec:** `specs/spec_renderer_markdown.md` (§3 Project Detail +> > Sections) +> > +> > **Goal:** Add a subsection clarifying that when the _Description_ call-out +> > uses `Readme.content`, the renderer must: +> > +> > 1. Strip YAML front-matter. +> > 2. Render the first 10 non-empty lines. +> > 3. Append `…continues…` if more content was omitted. +> > +> > **Primary files:** +> > +> > - `specs/spec_renderer_markdown.md` +> > +> > **Hints:** +> > +> > - Place this clarification right after rule 3.1 in §3. +> > - Provide a small Markdown example block. +> > - Remember to update any cross-links. +> +> \[…further tasks omitted for brevity…\] + +::: + +### Executing + +With all these detailed instructions, changes to the code are narrowly focused and +the changes are easily reviewable. Execution does not need any deep thinking and +can be done by a fast and cheap code-model like `gemini-2.5`, `claude3-sonnet`, +etc. + +They usually just load all relevant files as context, work step-by-step, +verify, tick off the task and report back for review. + +> [!tip] **User** +> +> `@xxx-work-on-tasks.mdc` `@/markdown_rendering` Implement all Task*1*\\\*.md. + +::: {.column-page-right} + +> [!note] **Cursor** +> +> I'll help you implement the tasks from Task*1*\*.md in the markdown rendering +> feature. Let me first check what these tasks entail. +> +> Let me first check the current state of the spec file we need to modify. +> +> I'll now implement both tasks. Let's start with Task 1.1 - amending the +> alphabetical ordering rule. +> +> ```markdown +> // ... existing code ... +> +> ## 4.
Ordering Guarantees +> +> - Groups are ordered by the number of Projects rendered (alphabetically on a +> tie). +> - Summary project order mirror the incoming, already-sorted data (see +> [Table Sorting](./spec_table_sorting.md)). +> - In the detailed part, projects appear alphabetically by their name. +> +> // ... existing code ... +> ``` +> +> Now let's implement Task 1.2 - documenting the 10-line truncation rule for the +> Description call-out. +> +> ````markdown +> // ... existing code ... +> +> Missing entries from the Project-Info are omitted. +> +> When using `Readme.content` in a Description call-out, the renderer must: +> +> ```markdown +> 1. Strip any YAML front-matter from the content +> 2. Include only the first 10 non-empty lines +> 3. If content was truncated, append "…continues…" on a new line +> ``` +> +> Example: +> +> ```markdown +> > [!warning] Description +> > +> > # My Project +> > +> > This is a great project that does many things. It has multiple features: +> > +> > - Feature 1 +> > - Feature 2 +> > - Feature 3 +> > - Feature 4 …continues… +> ``` +> +> 2. **Release badge** – image linking to `/-/releases`. // ... +> existing code ... +> ```` +> +> Let me run markdownlint as suggested in the task hints to ensure our changes +> meet the markdown style guidelines. +> +> I've completed both tasks: +> +> 1. Task 1.1: Updated the ordering rule in section 4 to specify that projects +> appear alphabetically by their name in the detailed part, replacing the +> previous rule about maintaining summary table order. +> 2. Task 1.2: Added documentation about the Description call-out truncation +> rules after section 3.1, including: +> - The three required steps (strip YAML front-matter, render first 10 +> non-empty lines, append "...continues...") +> - A clear example showing how the truncation should look +> +> The markdownlint output shows some line length issues and a heading issue, but +> these appear to be pre-existing in the file and not related to our changes. +> The changes we made maintain the existing formatting style of the document. +> +> Would you like me to address any of the markdownlint issues, or would you +> prefer to keep the changes focused on just implementing the two tasks? + +::: + +And within one minute everything was updated as expected and verified within a few +seconds. Here you can really see the advantages of this approach. Instead of +going off-track and focusing e.g. on fixing the `markdownlint`-issues^[As I have +often experienced with LLMs trying to be too perfect and pleasing the user +without ever stopping.] the agent notes that this is still an issue, but offers +to stay focused on the tasks for the moment. With a simple 'go' I can continue to +tackle the next thing. Five minutes later everything is done and we have one less +issue. + +## Result + +In the fast-evolving field of AI-assisted software development, my experience +with Cursor's specification-driven workflow has yielded tangible outcomes: + +- **Code Quality**: By enforcing strict compliance checks and structured task + creation, redundant code generation was reduced. The LLM still gets excited to + "help", but now it's actually contributing meaningful changes instead of + reinventing the wheel. +- **Development Efficiency**: The PRD-first approach has cut trial&error time + significantly. Before, getting the things I wanted was hit&miss. Now I "read + the agent's mind" and correct it during specification-creation.
- **Project Integrity**: Through rigorous spec adherence and systematic task + breakdown, the codebase is more consistent, much better documented and + decently tested^[You know.. no one likes writing tests - and the person writing + the code should never write the tests anyway. If you haven't thought of + something while coding, chances are that you'll miss that edge-case during + testing as well.]. +- **Reduced Cognitive Load**: The Cursor rule system has made it easier to focus + on actual development tasks instead of wrestling with the AI's sometimes + questionable assumptions - or just installing a third, different testing + framework for exactly this module. + +In this `gitlab_overviewer` case study, I tried to show with an easy example that +this method works and _can_ yield great outcomes. Even small discrepancies in +the codebase tend to pop up during spec-reviews (which can be automated!). Next +up would be running those in some kind of CI-job and integrating tools like +issue-tracking into the agent instead of simple markdown-files in the repository +as a makeshift issue-tracker. But not by me for the foreseeable future, so if you +are looking for a project, feel free! + +**All in all this isn't a silver bullet for all AI-assisted development +problems, but it's made my coding experience with Cursor much more productive +and predictable. It turns out treating an AI as a slightly overeager junior +developer who needs clear instructions works better than hoping it'll just "do +the right thing".** diff --git a/Writing/images/initial_analysis.png b/Writing/images/initial_analysis.png new file mode 100644 index 0000000..5efd0d9 Binary files /dev/null and b/Writing/images/initial_analysis.png differ diff --git a/Writing/images/thoughts_after_commenting.png b/Writing/images/thoughts_after_commenting.png new file mode 100644 index 0000000..989f853 Binary files /dev/null and b/Writing/images/thoughts_after_commenting.png differ diff --git a/dist/About/Experience.html b/dist/About/Experience.html index 492bb18..e7981cf 100644 --- a/dist/About/Experience.html +++ b/dist/About/Experience.html @@ -328,6 +328,12 @@ ul.task-list li input[type="checkbox"] {