[
  {"arxiv_id": "2001.08361", "title": "Scaling Laws for Neural Language Models", "authors": "Kaplan, McCandlish, Henighan, Brown et al. (OpenAI)", "year": 2020, "month": "2020-01", "theme": "scaling_laws", "role": "seminal"},
  {"arxiv_id": "2203.15556", "title": "Training Compute-Optimal Large Language Models (Chinchilla)", "authors": "Hoffmann et al. (DeepMind)", "year": 2022, "month": "2022-03", "theme": "scaling_laws", "role": "compute-optimal correction"},
  {"arxiv_id": "2401.00448", "title": "Beyond Chinchilla-Optimal: Accounting for Inference in Language Model Scaling Laws", "authors": "Sardana et al.", "year": 2024, "month": "2024-01", "theme": "scaling_laws", "role": "inference-aware extension"},

  {"arxiv_id": "2206.07682", "title": "Emergent Abilities of Large Language Models", "authors": "Wei, Tay, Bommasani, Raffel et al.", "year": 2022, "month": "2022-06", "theme": "emergent_abilities", "role": "seminal claim"},
  {"arxiv_id": "2304.15004", "title": "Are Emergent Abilities of Large Language Models a Mirage?", "authors": "Schaeffer, Miranda, Koyejo", "year": 2023, "month": "2023-04", "theme": "emergent_abilities", "role": "counter (NeurIPS 2023 Outstanding Paper)"},
  {"arxiv_id": "2503.05788", "title": "Emergent Abilities in Large Language Models: A Survey", "authors": "Berti, Giorgi, Kasneci", "year": 2025, "month": "2025-03", "theme": "emergent_abilities", "role": "synthesis"},

  {"arxiv_id": "2203.02155", "title": "Training Language Models to Follow Instructions with Human Feedback (InstructGPT)", "authors": "Ouyang, Wu, Jiang et al. (OpenAI)", "year": 2022, "month": "2022-03", "theme": "alignment_rlhf", "role": "seminal RLHF recipe"},
  {"arxiv_id": "2212.08073", "title": "Constitutional AI: Harmlessness from AI Feedback", "authors": "Bai et al. (Anthropic)", "year": 2022, "month": "2022-12", "theme": "alignment_rlhf", "role": "RLAIF / constitutional"},
  {"arxiv_id": "2305.18290", "title": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model", "authors": "Rafailov, Sharma, Mitchell, Ermon, Manning, Finn", "year": 2023, "month": "2023-05", "theme": "alignment_rlhf", "role": "DPO — default open-source alignment 2024+"},

  {"arxiv_id": "2005.14165", "title": "Language Models are Few-Shot Learners (GPT-3)", "authors": "Brown, Mann, Ryder et al. (OpenAI)", "year": 2020, "month": "2020-05", "theme": "icl_cot", "role": "in-context learning seminal"},
  {"arxiv_id": "2201.11903", "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models", "authors": "Wei, Wang, Schuurmans, Bosma, Ichter, Xia, Chi, Le, Zhou", "year": 2022, "month": "2022-01", "theme": "icl_cot", "role": "CoT seminal"},

  {"arxiv_id": "2002.08909", "title": "REALM: Retrieval-Augmented Language Model Pre-Training", "authors": "Guu, Lee, Tung, Pasupat, Chang", "year": 2020, "month": "2020-02", "theme": "rag", "role": "joint retriever+LM pretraining"},
  {"arxiv_id": "2005.11401", "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks", "authors": "Lewis et al. (FAIR)", "year": 2020, "month": "2020-05", "theme": "rag", "role": "canonical RAG framework"},

  {"arxiv_id": "1701.06538", "title": "Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer", "authors": "Shazeer, Mirhoseini, Maziarz, Davis, Le, Hinton, Dean", "year": 2017, "month": "2017-01", "theme": "moe", "role": "MoE seminal"},
  {"arxiv_id": "2006.16668", "title": "GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding", "authors": "Lepikhin et al. (Google)", "year": 2020, "month": "2020-06", "theme": "moe", "role": "first practical 600B MoE"},
  {"arxiv_id": "2101.03961", "title": "Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity", "authors": "Fedus, Zoph, Shazeer", "year": 2021, "month": "2021-01", "theme": "moe", "role": "top-1 routing, trillion-param"},
  {"arxiv_id": "2112.06905", "title": "GLaM: Efficient Scaling of Language Models with Mixture-of-Experts", "authors": "Du et al. (Google)", "year": 2021, "month": "2021-12", "theme": "moe", "role": "1.2T params, 64 experts"},
  {"arxiv_id": "2401.04088", "title": "Mixtral of Experts (Mixtral 8x7B)", "authors": "Jiang et al. (Mistral AI)", "year": 2024, "month": "2024-01", "theme": "moe", "role": "first open MoE matching GPT-3.5"},

  {"arxiv_id": "2103.00020", "title": "Learning Transferable Visual Models From Natural Language Supervision (CLIP)", "authors": "Radford, Kim et al. (OpenAI)", "year": 2021, "month": "2021-02", "theme": "multimodal", "role": "contrastive image-text seminal"},
  {"arxiv_id": "2102.12092", "title": "Zero-Shot Text-to-Image Generation (DALL-E)", "authors": "Ramesh et al. (OpenAI)", "year": 2021, "month": "2021-02", "theme": "multimodal", "role": "text-to-image transformer"},
  {"arxiv_id": "2204.14198", "title": "Flamingo: a Visual Language Model for Few-Shot Learning", "authors": "Alayrac et al. (DeepMind)", "year": 2022, "month": "2022-04", "theme": "multimodal", "role": "VLM with gated cross-attention"},
  {"arxiv_id": "2204.06125", "title": "Hierarchical Text-Conditional Image Generation with CLIP Latents (DALL-E 2)", "authors": "Ramesh, Dhariwal, Nichol, Chu, Chen", "year": 2022, "month": "2022-04", "theme": "multimodal", "role": "CLIP-latent diffusion"},
  {"arxiv_id": "2312.11805", "title": "Gemini: A Family of Highly Capable Multimodal Models", "authors": "Gemini Team (Google)", "year": 2023, "month": "2023-12", "theme": "multimodal", "role": "first native-multimodal frontier model"},

  {"arxiv_id": "2210.03629", "title": "ReAct: Synergizing Reasoning and Acting in Language Models", "authors": "Yao, Zhao, Yu, Du, Shafran, Narasimhan, Cao", "year": 2022, "month": "2022-10", "theme": "agents_tools", "role": "reasoning+acting seminal"},
  {"arxiv_id": "2302.04761", "title": "Toolformer: Language Models Can Teach Themselves to Use Tools", "authors": "Schick, Dwivedi-Yu et al. (Meta)", "year": 2023, "month": "2023-02", "theme": "agents_tools", "role": "self-supervised tool use"},
  {"arxiv_id": "2412.17481", "title": "A Survey on LLM-based Multi-Agent System: Recent Advances and New Frontiers in Application", "authors": "Chen, Liu, Han, Zhang, Liu", "year": 2024, "month": "2024-12", "theme": "agents_tools", "role": "subfield recognition"},
  {"arxiv_id": "2503.16416", "title": "Survey on Evaluation of LLM-based Agents", "authors": "Yehudai, Eden, Li, Uziel, Zhao, Bar-Haim, Cohan, Shmueli-Scheuer", "year": 2025, "month": "2025-03", "theme": "agents_tools", "role": "evaluation maturity"},

  {"arxiv_id": "1804.07461", "title": "GLUE: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding", "authors": "Wang, Singh, Michael, Hill, Levy, Bowman", "year": 2018, "month": "2018-04", "theme": "benchmarks", "role": "first major NLU benchmark"},
  {"arxiv_id": "1905.00537", "title": "SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems", "authors": "Wang et al.", "year": 2019, "month": "2019-05", "theme": "benchmarks", "role": "harder successor after GLUE saturated"},
  {"arxiv_id": "2009.03300", "title": "Measuring Massive Multitask Language Understanding (MMLU)", "authors": "Hendrycks, Burns, Basart, Zou, Mazeika, Song, Steinhardt", "year": 2020, "month": "2020-09", "theme": "benchmarks", "role": "knowledge-breadth benchmark, default 2021-2024"},
  {"arxiv_id": "2206.04615", "title": "Beyond the Imitation Game: Quantifying and Extrapolating the Capabilities of Language Models (BIG-bench)", "authors": "Srivastava et al.", "year": 2022, "month": "2022-06", "theme": "benchmarks", "role": "204 tasks, 442 authors"},
  {"arxiv_id": "2211.09110", "title": "Holistic Evaluation of Language Models (HELM)", "authors": "Liang et al. (Stanford CRFM)", "year": 2022, "month": "2022-11", "theme": "benchmarks", "role": "16 scenarios x 7 metrics, holistic"},

  {"arxiv_id": "2302.13971", "title": "LLaMA: Open and Efficient Foundation Language Models", "authors": "Touvron et al. (Meta)", "year": 2023, "month": "2023-02", "theme": "open_weights", "role": "reset open-weight frontier (leaked Mar 2023)"},
  {"arxiv_id": "2307.09288", "title": "Llama 2: Open Foundation and Fine-Tuned Chat Models", "authors": "Touvron et al. (Meta)", "year": 2023, "month": "2023-07", "theme": "open_weights", "role": "first openly licensed frontier model"},
  {"arxiv_id": "2310.06825", "title": "Mistral 7B", "authors": "Jiang et al. (Mistral AI)", "year": 2023, "month": "2023-10", "theme": "open_weights", "role": "Apache 2.0, beat Llama 2 13B"},
  {"arxiv_id": "2407.21783", "title": "The Llama 3 Herd of Models", "authors": "Meta AI", "year": 2024, "month": "2024-07", "theme": "open_weights", "role": "405B competitive with GPT-4/Claude 3.5"},
  {"arxiv_id": "2412.19437", "title": "DeepSeek-V3 Technical Report", "authors": "DeepSeek AI", "year": 2024, "month": "2024-12", "theme": "open_weights", "role": "open MoE frontier"},
  {"arxiv_id": "2501.12948", "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning", "authors": "DeepSeek AI", "year": 2025, "month": "2025-01", "theme": "open_weights", "role": "open-weight reasoning parity with o1"},

  {"arxiv_id": "2108.07258", "title": "On the Opportunities and Risks of Foundation Models", "authors": "Bommasani et al. (Stanford CRFM)", "year": 2021, "month": "2021-08", "theme": "roadmap_survey", "role": "agenda-setting"},
  {"arxiv_id": "2303.18223", "title": "A Survey of Large Language Models", "authors": "Zhao, Zhou, Li, Tang et al.", "year": 2023, "month": "2023-03", "theme": "roadmap_survey", "role": "most-cited LLM survey"},
  {"arxiv_id": "2304.02020", "title": "A Bibliometric Review of Large Language Models Research from 2017 to 2023", "authors": "Fan, Li, Ma, Lee, Yu, Hemphill", "year": 2023, "month": "2023-04", "theme": "bibliometric", "role": "ACM TIST 2024; ~5,000 publications analyzed"}
]
