<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article
  PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD with MathML3 v1.2 20190208//EN" "JATS-journalpublishing1-mathml3.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" article-type="research-article" dtd-version="1.2" xml:lang="en">
<front>
<journal-meta><journal-id journal-id-type="publisher-id">BIOLING</journal-id><journal-id journal-id-type="nlm-ta">Biolinguistics</journal-id>
<journal-title-group>
<journal-title>Biolinguistics</journal-title><abbrev-journal-title abbrev-type="pubmed">Biolinguistics</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">1450-3417</issn>
<publisher><publisher-name>PsychOpen</publisher-name></publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">bioling.19021</article-id>
<article-id pub-id-type="doi">10.5964/bioling.19021</article-id>
<article-categories>
<subj-group subj-group-type="heading"><subject>Forum</subject></subj-group>


<subj-group subj-group-type="badge">
<subject>Data</subject>
</subj-group>


</article-categories>
<title-group>
<article-title>Fundamental Principles of Linguistic Structure Are Not Represented by ChatGPT</article-title>
	<alt-title alt-title-type="right-running">Fundamental Principles of Linguistic Structure Are Not Represented by ChatGPT</alt-title>
<alt-title specific-use="APA-reference-style" xml:lang="en">Fundamental principles of linguistic structure are not represented by ChatGPT</alt-title>
</title-group>
<contrib-group>
	<contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Murphy</surname><given-names>Elliot</given-names></name><xref ref-type="corresp" rid="cor1">*</xref><xref ref-type="aff" rid="aff1"><sup>1</sup></xref><xref ref-type="aff" rid="aff2"><sup>2</sup></xref></contrib>
<contrib contrib-type="author"><name name-style="western"><surname>Leivada</surname><given-names>Evelina</given-names></name><xref ref-type="aff" rid="aff3"><sup>3</sup></xref><xref ref-type="aff" rid="aff4"><sup>4</sup></xref></contrib>
<contrib contrib-type="author"><name name-style="western"><surname>Dentella</surname><given-names>Vittoria</given-names></name><xref ref-type="aff" rid="aff5"><sup>5</sup></xref></contrib>
<contrib contrib-type="author"><name name-style="western"><surname>Montero</surname><given-names>Raquel</given-names></name><xref ref-type="aff" rid="aff3"><sup>3</sup></xref></contrib>
<contrib contrib-type="author"><name name-style="western"><surname>Günther</surname><given-names>Fritz</given-names></name><xref ref-type="aff" rid="aff6"><sup>6</sup></xref></contrib>
<contrib contrib-type="author"><name name-style="western"><surname>Marcus</surname><given-names>Gary</given-names></name><xref ref-type="aff" rid="aff7"><sup>7</sup></xref></contrib>
<contrib contrib-type="editor">
<name>
	<surname>Grohmann</surname>
	<given-names>Kleanthes K.</given-names>
</name>
<xref ref-type="aff" rid="aff8"/>
</contrib>
<aff id="aff1"><label>1</label><institution>Vivian L. Smith Department of Neurosurgery, UTHealth</institution>, <addr-line><city>Houston</city>, <state>TX</state></addr-line>, <country country="US">USA</country></aff>
<aff id="aff2"><label>2</label><institution>Texas Institute for Restorative Neurotechnologies, UTHealth</institution>, <addr-line><city>Houston</city>, <state>TX</state></addr-line>, <country country="US">USA</country></aff>
	<aff id="aff3"><label>3</label>Departament de Filologia Catalana, <institution>Universitat Autònoma de Barcelona</institution>, <addr-line><city>Barcelona</city></addr-line>, <country country="ES">Spain</country></aff>
<aff id="aff4"><label>4</label><institution>Institució Catalana de Recerca i Estudis Avançats (ICREA)</institution>, <addr-line><city>Barcelona</city></addr-line>, <country country="ES">Spain</country></aff>
	<aff id="aff5"><label>5</label>Department of Brain and Behavioral Sciences, <institution>University of Pavia</institution>, <addr-line><city>Pavia</city></addr-line>, <country country="IT">Italy</country></aff>
	<aff id="aff6"><label>6</label>Institut für Psychologie, <institution>Humboldt-Universität zu Berlin</institution>, <addr-line><city>Berlin</city></addr-line>, <country country="DE">Germany</country></aff>
	<aff id="aff7"><label>7</label>Department of Psychology, <institution>New York University</institution>, <addr-line><city>New York</city>, <state>NY</state></addr-line>, <country country="US">USA</country></aff>
	<aff id="aff8"><institution>University of Cyprus</institution>, <addr-line><city>Nicosia</city></addr-line>, <country country="CY">Cyprus</country></aff>
</contrib-group>
<author-notes>
	<corresp id="cor1"><label>*</label>Texas Institute for Restorative Neurotechnologies, 1133 John Freeman Blvd, Houston, TX 77030, USA. <email xlink:href="elliot.murphy@uth.tmc.edu">elliot.murphy@uth.tmc.edu</email></corresp>
</author-notes>
<pub-date date-type="pub" publication-format="electronic"><day>04</day><month>12</month><year>2025</year></pub-date>
	<pub-date pub-type="collection" publication-format="electronic"><year>2025</year></pub-date>
<volume>19</volume>
<elocation-id>e19021</elocation-id>
<history>
<date date-type="received">
<day>24</day>
<month>07</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>29</day>
<month>09</month>
<year>2025</year>
</date>
</history>
<permissions><copyright-year>2025</copyright-year><copyright-holder>Murphy, Leivada, Dentella et al.</copyright-holder><license license-type="open-access" specific-use="CC BY 4.0" xlink:href="https://creativecommons.org/licenses/by/4.0/"><ali:license_ref>https://creativecommons.org/licenses/by/4.0/</ali:license_ref><license-p>This is an open access article distributed under the terms of the Creative Commons Attribution 4.0 International License, CC BY 4.0, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p></license></permissions>
<abstract>
	<p>A core component of a successful artificial general intelligence would be the rapid creation and manipulation of grounded compositional abstractions and the demonstration of expertise in the family of recursive hierarchical syntactic objects necessary for the creative use of human language. We evaluated the recently released o3 model (OpenAI; o3-mini-high) from ChatGPT and discovered that while it succeeds on some basic linguistic tests relying on linear, surface statistics (e.g., the Strawberry Test), it fails to generalize basic phrase structure rules; it fails with comparative sentences involving semantically illegal cardinality comparisons (‘Escher sentences’); it fails to correctly rate and explain acceptability dynamics; and it fails to distinguish between instructions to generate unacceptable semantic vs. unacceptable syntactic outputs. When tasked with generating simple violations of grammatical rules, it is seemingly incapable of representing multiple parses to evaluate against various possible semantic interpretations. We ran all of these prompts multiple times again through the API and provide basic accuracy scores. In stark contrast to many recent claims that artificial language models are on the verge of replacing the field of linguistics, our results suggest not only that deep learning is hitting a wall with respect to compositionality (<xref ref-type="bibr" rid="r25">Marcus, 2022</xref>), but that it is hitting [<italic>a</italic> [<italic>stubbornly</italic> [<italic>resilient wall</italic>]]] that cannot readily be surmounted to reach human-like compositional reasoning simply through more compute.</p>
</abstract>
<kwd-group kwd-group-type="author"><kwd>compositionality</kwd><kwd>syntax</kwd><kwd>OpenAI</kwd><kwd>o3</kwd><kwd>semantics</kwd></kwd-group>

</article-meta>
</front>
<body>
<sec id="sec1" sec-type="intro"><title>1. Introduction</title>
	<p>Large language models—deep neural nets trained in next-word prediction in a large corpus of text—have proven capable of parsing the complex sequential statistics of written text without many obvious grammatical errors (<xref ref-type="bibr" rid="r4">Besta et al., 2025</xref>; <xref ref-type="bibr" rid="r22">Lindström, 2024</xref>; <xref ref-type="bibr" rid="r54">Russin et al., 2024</xref>; <xref ref-type="bibr" rid="r63">Zhao &amp; Zhang, 2024</xref>). This has spurred many to deem them capable of human-like compositionality, in particular with respect to syntax-semantics (<xref ref-type="bibr" rid="r23">Mahowald et al., 2024</xref>). Some have even claimed that “large language models are better than theoretical linguists at theoretical linguistics” (<xref ref-type="bibr" rid="r64">Ambridge &amp; Blything, 2024</xref>), and that we are facing “the end of (generative) linguistics as we know it” (<xref ref-type="bibr" rid="r6">Chesi Forthcoming</xref>) (although Chesi qualifies that many modern approaches to generative grammar are arguably just as architecturally opaque as LLMs). This would be an extremely consequential state of affairs—if it can be shown to be true. Yet, much recent work indicates that they merely <italic>emulate human language</italic> (<xref ref-type="bibr" rid="r8">Dentella et al., 2023</xref>, <xref ref-type="bibr" rid="r9">2024</xref>; <xref ref-type="bibr" rid="r16">Katzir, 2023</xref>; <xref ref-type="bibr" rid="r55">Schaeffer et al., 2023</xref>) as opposed to being in possession of human-like syntactic competence.</p>
<p>In this report, the most recent reasoning model from OpenAI (o3-mini-high) is evaluated for its ability to assess and generate compositional representations. o3, like other ‘reasoning models’, is based on large language models but includes additional modules to improve certain computational functions and multi-step logical reasoning. Others have already expressed scepticism about the promise of o3. For example, its recent high performance on the ARC-AGI test “is not due to intelligence but due to the application of knowledge and computing resources that together enable an effective search in the given space of possible solutions” (<xref ref-type="bibr" rid="r51">Pfister &amp; Jud, 2025</xref>). We agree in principle with the assessment in <xref ref-type="bibr" rid="r30">Mollica and Piantadosi (2022)</xref> that “Linguistic corpora are a low-dimensional projection of both syntax and thought, so it is not implausible that a smart learning system could recover at least some aspects of these cognitive systems from watching text alone”. The critical challenge, as ever, is to demonstrate this capacity <italic>empirically</italic>.</p>
<p>In our report, a number of basic flaws are discovered and noted with respect to the linguistic capabilities of o3. These pertain to fundamental properties of basic sentence structure building and semantic evaluation.</p></sec>
<sec id="sec2" sec-type="methods"><title>2. Method</title>
<p>We identified a number of basic linguistic processes, and a number of more hierarchically complex computations, to subject to direct investigation. o3-mini (<xref ref-type="bibr" rid="r46">OpenAI, 2025</xref>) was prompted via OpenAI’s API with the reasoning effort set to ‘high’ (as of September 2025, OpenAI does not have a temperature parameter that the user can specify for this model). To maximize reproducibility as much as possible, an integer seed was specified for each prompt and the system reference field was saved. Lastly, to explore the consistency in the responses, the model was asked the same prompt 3 times. Prompts are directly reproduced in highlighted red boxes, and responses are directly reproduced below. Given the preliminary nature of our experimental report, we refrain from conducting analyses such as logistic regressions and provide only elementary descriptive statistics. More systematic analyses will be forthcoming in future work.</p></sec>
<sec id="sec3" sec-type="results"><title>3. Results</title>
<p>We begin with initially unproblematic tests for the model (Prompts 1-7), before moving to more complex tests that proved problematic (Prompts 8-26).</p>
<sec id="sec3.1"><title>3.1. Linear Order and Basic Dependencies</title>
<p>Starting first with some basic linear-based computations that do not involve higher-order compositional reasoning, o3-mini-high was able to successfully generate the following responses.</p>
	
<disp-quote>
<p><bold>Prompt 1</bold></p>
	<p><?disable-indent?>Generate a palindrome that uses the word ‘knight’.</p></disp-quote>
	
	<fig id="f1" position="anchor" fig-type="figure" orientation="portrait"><graphic xlink:href="bioling.19021-f1" position="anchor" orientation="portrait"/></fig>
	
<p>It passed the ‘strawberry test’.</p>
	
<disp-quote>
<p><bold>Prompt 2</bold></p>
	<p><?disable-indent?>How many occurrences of the letter ‘r’ are there in ‘strawberry’?</p></disp-quote>
	
	<fig id="f2" position="anchor" fig-type="figure" orientation="portrait"><graphic xlink:href="bioling.19021-f2" position="anchor" orientation="portrait"/></fig>
	
	<p>The model was prompted with the following series of requests, and delivered reasonable responses for all (Prompts 3-7).</p>
	
<disp-quote>
<p><bold>Prompt 3</bold></p>
	<p><?disable-indent?>In the sentence ‘Professors were mean but teachers were nice, they were likely moody’, who does ‘they’ refer to?</p></disp-quote>
	
	<fig id="f3" position="anchor" fig-type="figure" orientation="portrait"><graphic xlink:href="bioling.19021-f3" position="anchor" orientation="portrait"/></fig>
	
	<disp-quote><p><bold>Prompt 4</bold></p>
	<p><?disable-indent?>In the sentence ‘Teachers were nice but professors were mean, they were likely moody’, who does ‘they’ refer to?</p></disp-quote>
	
	<fig id="f4" position="anchor" fig-type="figure" orientation="portrait"><graphic xlink:href="bioling.19021-f4" position="anchor" orientation="portrait"/></fig>
	
	<disp-quote><p><bold>Prompt 5</bold></p>
	<p><?disable-indent?>In the sentence ‘Bill was happy but Mary was sad, he was probably overworked’, who does ‘he’ refer to?</p></disp-quote>
	
	<fig id="f5" position="anchor" fig-type="figure" orientation="portrait"><graphic xlink:href="bioling.19021-f5" position="anchor" orientation="portrait"/></fig>
	
	<disp-quote><p><bold>Prompt 6</bold></p>
	<p><?disable-indent?>Does this sentence make sense to you?</p>
</disp-quote>
	
	<fig id="f6" position="anchor" fig-type="figure" orientation="portrait"><graphic xlink:href="bioling.19021-f6" position="anchor" orientation="portrait"/></fig>
</sec>
	
<sec id="sec3.2"><title>3.2. Phrase Structure</title>
<p>Next, the model was tested for basic phrase structure representations.</p>
	
<disp-quote>
<p><bold>Prompt 7</bold></p>
	<p><?disable-indent?>Is ‘Dogs dogs dog dog dogs’ grammatical?</p></disp-quote>
	
	<fig id="f7" position="anchor" fig-type="figure" orientation="portrait"><graphic xlink:href="bioling.19021-f7" position="anchor" orientation="portrait"/></fig>
	
<p>Dog/dogs was then replaced with an invented pseudoword (Prompt 8). When presented with an ungrammatical structure (a superfluous ‘glarts’ was added to the grammatical 5-word formula above), the model incorrectly claimed that this was grammatical. The reasoning provided was fallacious, confusing the role of the middle words and misunderstanding the role of the final words.</p>
	
<disp-quote>
<p><bold>Prompt 8</bold></p>
	<p><?disable-indent?>Pretend that ‘glart’ is a word that refers to a group of alien creatures, and can also refer to the action of pleasing. In this context, is ‘Glarts glarts glart glart glarts glarts’ grammatical?</p></disp-quote>
	
	<fig id="f8" position="anchor" fig-type="figure" orientation="portrait"><graphic xlink:href="bioling.19021-f8" position="anchor" orientation="portrait"/></fig>
	
<p>When prompted with an even more preposterous example (adding three additional “glarts” to the end of the initially grammatical “Glarts glarts glart glart glarts”), the model generated an inaccurate tree structure that was not faithful to the string input (by mistakenly including more than two instances of “glart”) and declared it to be grammatical.</p>
	
<disp-quote>
<p><bold>Prompt 9</bold></p>
	<p><?disable-indent?>Given the same context as above, is ‘Glarts glarts glart glart glarts glarts glarts glarts’ grammatical?</p></disp-quote>
	
	<fig id="f9" position="anchor" fig-type="figure" orientation="portrait"><graphic xlink:href="bioling.19021-f9" position="anchor" orientation="portrait"/></fig>
	
	<fig id="f10" position="anchor" fig-type="figure" orientation="portrait"><graphic xlink:href="bioling.19021-f10" position="anchor" orientation="portrait"/></fig>

</sec>
<sec id="sec3.3"><title>3.3. Escher Sentences</title>
<p>Next, we turned to comparative sentences involving semantically illegal cardinality comparisons (sometimes termed ‘Escher sentences’). o3-mini-high failed to parse the comparative illusion, noting only the structural acceptability, despite the sentence being ungrammatical.</p>
	
<disp-quote>
<p><bold>Prompt 10</bold></p>
	<p><?disable-indent?>Is the sentence ‘Fewer athletes have been to Beijing than I have’ acceptable?</p></disp-quote>
	
	<fig id="f11" position="anchor" fig-type="figure" orientation="portrait"><graphic xlink:href="bioling.19021-f11" position="anchor" orientation="portrait"/></fig>

	<disp-quote>
<p><bold>Prompt 11</bold></p>
	<p><?disable-indent?>Is the sentence ‘More women have finished university than he has’ acceptable?</p></disp-quote>
	
	<fig id="f12" position="anchor" fig-type="figure" orientation="portrait"><graphic xlink:href="bioling.19021-f12" position="anchor" orientation="portrait"/></fig></sec>
<sec id="sec3.4"><title>3.4. Center-Embedding</title>
<p>We tested center-embedding acceptability. The model failed to detect ungrammaticality due to a missing verb (or superfluous Noun Phrase). The reasoning provided was flawed and included some hallucination of pronominal elements (although the model helpfully does not recommend this sentence “for everyday use”!).</p>
	
<disp-quote>
<p><bold>Prompt 12</bold></p>
	<p><?disable-indent?>Is ‘The doctor the nurse the hospital had hired met John?’ acceptable?</p></disp-quote>
	
	<fig id="f13" position="anchor" fig-type="figure" orientation="portrait"><graphic xlink:href="bioling.19021-f13" position="anchor" orientation="portrait"/></fig>
	
<p>With the next prompt below, the model fallaciously injects an additional ‘met’ not present in the prompt.</p>
	
<disp-quote>
<p><bold>Prompt 13</bold></p>
	<p><?disable-indent?>Draw me a syntactic tree structure, in line with Minimalist syntax, for the sentence ‘The doctor the nurse the hospital had hired met John’.</p></disp-quote>
	
	<fig id="f14" position="anchor" fig-type="figure" orientation="portrait"><graphic xlink:href="bioling.19021-f14" position="anchor" orientation="portrait"/></fig>
	
	<fig id="f15" position="anchor" fig-type="figure" orientation="portrait"><graphic xlink:href="bioling.19021-f15" position="anchor" orientation="portrait"/></fig>
	
<p>The syntactic tree structure generated above does not successfully map onto the interpretation seemingly intended by the model’s output.</p></sec>
<sec id="sec3.5"><title>3.5. Paradox</title>
<p>When pushed with the below prompt – a more audacious semantic task – the response does not deliver on its stated reasoning plan to inject ‘ambiguity’ and ‘intricacy’. There is no element of surprisal or contradiction beyond the standard liar’s paradox (i.e., “This statement is false”). Of course, it is perhaps unfair and unreasonable to expect the model to create a paradox “more complex and sophisticated than all existing paradoxes or puzzles”, but the point to be made here concerns the means through which the model judged and reasoned on its attempt.</p>
<disp-quote>
<p><bold>Prompt 14</bold></p>
	<p><?disable-indent?>Create for me a semantic or philosophical paradox that is more complex and sophisticated than all existing paradoxes or puzzles.</p></disp-quote><fig id="f16" position="anchor" fig-type="figure" orientation="portrait"><graphic xlink:href="bioling.19021-f16" position="anchor" orientation="portrait"/></fig></sec>
<sec id="sec3.6"><title>3.6. Generation of Syntactic Violations</title>
<p>When asked to generate an ungrammatical sentence (an easy task for humans – one could simply recite a list of random words), the model struggled in a rather peculiar way. It tried to generate a semantically anomalous output but the sentence that was generated was entirely grammatical and acceptable. The final step in its chain of reasoning claimed to ensure the presence of “a series of ungrammatical, confusing phrases”—which were not present in the output. The model’s output may be rather Joycean and surreal in its content, but it does not satisfy the clear instruction to be ungrammatical.</p>
<disp-quote>
<p><bold>Prompt 15</bold></p>
	<p><?disable-indent?>Can you generate for me a complex sentence that merges multiple modifiers and clauses into an ungrammatical output. Please ensure that the sentence is ungrammatical with no room for ambiguity.</p></disp-quote><fig id="f17" position="anchor" fig-type="figure" orientation="portrait"><graphic xlink:href="bioling.19021-f17" position="anchor" orientation="portrait"/></fig>
<p>The model also failed to provide a basic example of an ungrammatical sentence in line with a rather direct task of violation-formation.</p>
<disp-quote>
<p><bold>Prompt 16</bold></p>
	<p><?disable-indent?>Generate an English sentence that violates a recursive application of a grammatical rule. Please choose any syntactic rule you like.</p></disp-quote><fig id="f18" position="anchor" fig-type="figure" orientation="portrait"><graphic xlink:href="bioling.19021-f18" position="anchor" orientation="portrait"/></fig>
<p>The model provided a grammatical English sentence, with fallacious reasoning as to its putative unacceptability (“Who did who see?” is a common multiple <italic>wh</italic>-question seeking the agent and participant of a seeing event). Technically, the model makes a valid point about the presence of multiple <italic>wh</italic>-operators often leading to illegal read-outs, but failed to then reflect on the other possible readings of the simple four-word string it outputted and claimed to be fundamentally ungrammatical. This provides a more stringent test for (the lack of) compositional syntax than the more common tests recently used that simply task language models with dispassionately generating strings of discourse with certain stylistic qualities (<xref ref-type="bibr" rid="r48">Piantadosi, 2024</xref>).</p></sec>
<sec id="sec3.7"><title>3.7. Generation of Multiple Syntactic Violations</title>
<p>Next, o3-mini-high failed in a number of ways with the following prompts designed to test the parsing of multiple, related syntactic representations.</p>
<disp-quote>
<p><bold>Prompt 17</bold></p>
	<p><?disable-indent?>Generate two sentences. The first sentence must contain one type of syntactic violation. The second sentence must continue the discourse content from the first, but must contain a different type of syntactic violation that explicitly is caused by some type of relation or connection with the first sentence. Draw a Minimalist tree structure to map the explicit coordination of these multiple error types.</p></disp-quote><fig id="f19" position="anchor" fig-type="figure" orientation="portrait"><graphic xlink:href="bioling.19021-f19" position="anchor" orientation="portrait"/></fig><fig id="f20" position="anchor" fig-type="figure" orientation="portrait"><graphic xlink:href="bioling.19021-f20" position="anchor" orientation="portrait"/></fig><fig id="f21" position="anchor" fig-type="figure" orientation="portrait"><graphic xlink:href="bioling.19021-f21" position="anchor" orientation="portrait"/></fig>
<p>The model fails to take account of the fact that Sentence 1 (‘The pair of scholars debate their thesis in a hurried conference’) is perfectly grammatical under standard English present tense. It focused only on how ‘pair’ is singular and so ‘would require “debates”’ – seemingly incapable of parsing interactional syntactic dynamics that require multiple steps to construct and evaluate against various possible semantic interpretations. Instead, it seemed limited to evaluating syntactic violations on a <italic>mono-configurational basis</italic>, failing to reflect on how one possible violation type could directly lead to multiple different types of acceptability under standard English syntax. In other words, humans would readily notice that while Sentence 1 may technically violate one typically expected form of agreement relation, it does not preclude the string from being subject to a wholly standard and acceptable interpretation.</p>
<p>Next, consider Sentence 2 (‘Owing to this faulty construction, themselves misinterpreting the rule from the previous discussion, the committee postponed the session’). This sentence is also (awkwardly) grammatical under basic movement applications allowing ‘themselves’ to be interpreted with ‘the committee’. Interestingly, it also appears here that the content of the prompt has influenced the semantics of Sentence 2—which makes reference to some form of rule misinterpretation. The model seems incapable of abstracting away from the basic instruction to generate syntactic violations and provide a semantic representation that is wholly independent from aspects of statistical inferences made from the prompt. On top of this, Sentence 1 and 2 do not in fact form a coherent discourse continuation, as explicitly requested in the prompt.</p>
<p>The accompanying tree structure that was generated does not accurately represent the semantics of the two separate sentences, and appears to try and represent ‘postponed the session’ without any clear syntactic categorization.</p>
<p>Note also that the final explanation for these sentences focuses explicitly on the basic possible agreement relation between two discrete elements (‘The pair’ and ‘themselves’), rather than taking a more global syntactic assessment of the role of these two elements <italic>in the context</italic> of their respective syntactic structures. Not only does the model fail to generate clear syntactic violations, it also fails to provide a level of discourse coherence that is independent of the semantics of the prompt.</p>
<p>When these types of errors were presented to the model (Prompt 18), it provided two sentences that did indeed exhibit a coherent discourse relation. However, it still failed to generate a syntactic violation in Sentence 2 that relied on explicit properties of Sentence&nbsp;1.</p>
<disp-quote>
<p><bold>Prompt 18</bold></p>
	<p><?disable-indent?>Both Sentence 1 and Sentence 2 are grammatical English sentences. For example, Sentence 1 means ‘There are two scholars and they are presently debating their thesis’. Sentence 2 means that the committee - who were misinterpreting the rule from the previous discussion - postponed the session, and that this was due to ‘this faulty construction’. It is also unclear what the discourse relation is between Sentence 1 and 2. Sentence 1 is about monks and theses, and Sentence 2 is about committees and constructions.</p></disp-quote><fig id="f22" position="anchor" fig-type="figure" orientation="portrait"><graphic xlink:href="bioling.19021-f22" position="anchor" orientation="portrait"/></fig><fig id="f23" position="anchor" fig-type="figure" orientation="portrait"><graphic xlink:href="bioling.19021-f23" position="anchor" orientation="portrait"/></fig>
<p>The model highlighted how ‘This reverberation’ in Sentence 2 is related in meaning to the previous sentence—which is irrelevant to the requested task of relating the syntactic violation itself (and not just the semantics) in Sentence 2 back to Sentence 1 (recall that Prompt 17 requests “a different type of syntactic violation that explicitly is <italic>caused by some type of relation</italic> or connection with the first sentence”; the request here is that the violation itself is causally driven by properties of the first sentence, and not simply linked to its meaning).</p>
<p>When these further errors were presented to the model, it ultimately succeeded in generating two separate types of syntactic violations for Sentence 1 and 2. Yet, while the discourse relation between the sentences was salient, the syntactic violation in Sentence 2 still did not satisfy the request of being directly linked to properties of Sentence 1 (this could easily have been achieved via Binding restrictions or 𝜑-feature violations, for example). The tree structure provided was also insufficiently transparent as to the core syntactic relations between elements.</p>
<disp-quote>
<p><bold>Prompt 19</bold></p>
	<p><?disable-indent?>You have simply repeated the same type of violation across both sentences - you have not generated a second sentence whose violation is directly linked to properties of the first sentence.</p></disp-quote><fig id="f24" position="anchor" fig-type="figure" orientation="portrait"><graphic xlink:href="bioling.19021-f24" position="anchor" orientation="portrait"/></fig><fig id="f25" position="anchor" fig-type="figure" orientation="portrait"><graphic xlink:href="bioling.19021-f25" position="anchor" orientation="portrait"/></fig>
<p>When the model was once again corrected on this point, it provided two sentences that had the same type of syntactic violations (rather than different types), and the violation in Sentence 2 was again only related to the meaning of Sentence 1 but had zero connection to its syntactic configuration.</p>
<disp-quote>
<p><bold>Prompt 20</bold></p>
	<p><?disable-indent?>You have only linked Sentence 2’s violation back to discourse features of Sentence 1. I would like you to generate a violation in Sentence 2 that is linked to syntactic properties of Sentence 1.</p></disp-quote><fig id="f26" position="anchor" fig-type="figure" orientation="portrait"><graphic xlink:href="bioling.19021-f26" position="anchor" orientation="portrait"/></fig><fig id="f27" position="anchor" fig-type="figure" orientation="portrait"><graphic xlink:href="bioling.19021-f27" position="anchor" orientation="portrait"/></fig><fig id="f28" position="anchor" fig-type="figure" orientation="portrait"><graphic xlink:href="bioling.19021-f28" position="anchor" orientation="portrait"/></fig>
<p>The model believed that this was a success because the Sentence 2 violation ‘is directly inherited from a syntactic property (the double wh‐extraction) introduced in Sentence 1’ – even though the extraction in Sentence 2 is purely bound to properties of Sentence 2 itself, with no connection to syntactic features in Sentence 1. While the presence of ‘his’ in Sentence 2 does indeed refer back to ‘The senator’, the wh-extraction constitutes a violation for independent reasons, and so does not satisfy the requests of (i) <italic>generating two different types of syntactic violations</italic>, and (ii) <italic>forming the second violation via a direct connection to syntactic properties of Sentence 1</italic>.</p>
<p>To summarize this line of inquiry, we provided in total 6 successive prompts (over Section 3.6-3.7) requesting types of violations, and we plot below the success of the model in satisfying various of these requests as they pertain to elements of structure and meaning.</p>
<table-wrap id="t1" position="anchor" orientation="portrait">
<label>Table 1</label><caption><title>Representation of the Success of o3-Mini-High in Generating Different Types of Syntactic Violations; ‘No’ and ‘Yes’ Indicate Failure or Success</title></caption>
<table frame="hsides" rules="groups">
<col width="13%" align="left"/>
<col width="29%"/>
<col width="29%"/>
<col width="29%"/>
<thead>
<tr>
<th>Prompt #</th>
<th>Unacceptable Structure</th>
<th>Multiple Violation Types</th>
<th>Causally Driven Violation</th>
</tr>
</thead>
<tbody>
<tr>
<td>15</td>
	<td style="background-color: #ED6161">No</td>
<td>N/A</td>
<td>N/A</td>
</tr>
<tr>
<td>16</td>
	<td style="background-color: #ED6161">No</td>
<td>N/A</td>
<td>N/A</td>
</tr>
<tr>
<td>17</td>
	<td style="background-color: #ED6161">No</td>
	<td style="background-color: #ED6161">No</td>
	<td style="background-color: #ED6161">No</td>
</tr>
<tr>
<td>18</td>
	<td style="background-color: #8DEB9B">Yes</td>
	<td style="background-color: #8DEB9B">Yes</td>
	<td style="background-color: #ED6161">No</td>
</tr>
<tr>
<td>19</td>
	<td style="background-color: #8DEB9B">Yes</td>
	<td style="background-color: #8DEB9B">Yes</td>
	<td style="background-color: #ED6161">No</td>
</tr>
<tr>
<td>20</td>
	<td style="background-color: #8DEB9B">Yes</td>
	<td style="background-color: #ED6161">No</td>
	<td style="background-color: #ED6161">No</td>
</tr>
</tbody>
</table>
</table-wrap></sec>
<sec id="sec3.8"><title>3.8. Scope</title>
<p>Next, we turned to scope ambiguities (<xref ref-type="bibr" rid="r15">Kamath et al., 2024</xref>). o3-mini-high correctly identified Option A as the most commonly selected option (Prompt 21), but it did not provide any logical reasoning for why Option B below could in principle be true.</p>
<disp-quote>
<p><bold>Prompt 21</bold></p>
	<p><?disable-indent?>There are exactly six chairs evenly spaced around a circular table. On the basis of this statement alone, and with no further context, there are two options:</p>
	<p><?disable-indent?>A: The six different chairs are all around the same table.</p>
	<p><?disable-indent?>B: The six chairs aren’t all around the same table.</p>
	<p><?disable-indent?>Specifically in relation to this context, which of these two options is most likely?</p></disp-quote><fig id="f29" position="anchor" fig-type="figure" orientation="portrait"><graphic xlink:href="bioling.19021-f29" position="anchor" orientation="portrait"/></fig>
<p>The model’s logic implies that ‘a chair’ must semantically only refer to an absolute singular entity due purely to its grammatical features, which ignores how some interactional property of the syntactic features of the word and its role in a compositional structure could influence an alternative meaning to shift between broad and narrow scope readings (i.e., three chairs could surround Table A, and the other three chairs could surround Table B). This points to a lack of human-like arbitration between possible semantic representations delivered by a grammatical configuration and world knowledge.</p>
<p>Assessing the three bullet points in the explanation: When deciding between Options A and B, (i) there are many sentences that include the string ‘a circular table’ that readily result in an interpretation of multiple different tables (e.g., ‘Each Prince was gifted a circular table’); (ii) the even spacing does not strictly pertain to the decision at hand; and (iii) the model’s description of ‘Contextual Convention’ only begs the question by invoking circular reasoning (i.e., the sentence means X because it means X).</p></sec>
	<sec id="sec3.9"><title>3.9. Assessment of Grammaticality</title>
		<p>We asked the model to assess the acceptability (<xref ref-type="bibr" rid="r58">Tjuatja et al., 2024</xref>) and grammaticality of 16 sentences. Sentences (1)-(11) were ungrammatical, and the model successfully identified these as such. These ungrammatical sentences contained common violations discussed in the literature, such as adjunct islands, <italic>whether</italic>-islands, and binding condition violations. Sentences (12)-(16) were grammatical. However, the model incorrectly claimed that (12), (15) and (16) were ungrammatical, and its explanation for why (14) is grammatical was incorrect. Below the prompt, we focus on the responses pertinent to (12)-(16) since these were the items causing errors. This prompt is assuredly complex, but if artificial models are “better than theoretical linguists at theoretical linguistics” (<xref ref-type="bibr" rid="r64">Ambridge &amp; Blything, 2024</xref>) we might expect some general successes.</p>
<disp-quote>
<p><bold>Prompt 22</bold></p>
	<p><?disable-indent?>Please assess the following sentences for their acceptability and grammaticality. Explain how each of the sentences either does or does not violate any number of linguistic rules.</p>
	<p><?disable-indent?>1) The journalists said that Trump lied about each other.</p>
	<p><?disable-indent?>2) Mike tries will win.</p>
	<p><?disable-indent?>3) The man expected the client to shoot each other.</p>
	<p><?disable-indent?>4) For themselves to decide to go would be absurd.</p>
	<p><?disable-indent?>5) For each other to lose would be disgraceful.</p>
	<p><?disable-indent?>6) Sam believes to be intelligent.</p>
	<p><?disable-indent?>7) Kim expects Saul to like herself.</p>
	<p><?disable-indent?>8) I talked about Dale to himself.</p>
	<p><?disable-indent?>9) Who did Tom talk with Sally after seeing?</p>
	<p><?disable-indent?>10) Who does Diane wonder whether Cooper likes?</p>
	<p><?disable-indent?>11) What did you make the claim that Kyle bought?</p>
	<p><?disable-indent?>12) John likes Mary's picture of himself.</p>
	<p><?disable-indent?>13) John likes Mary's picture of herself.</p>
	<p><?disable-indent?>14) Jimmy expected Saul to win himself.</p>
	<p><?disable-indent?>15) Jimmy expected himself to win Saul.</p>
	<p><?disable-indent?>16) We think that they expected that pictures of each other would be in the room.</p></disp-quote><fig id="f30" position="anchor" fig-type="figure" orientation="portrait"><graphic xlink:href="bioling.19021-f30" position="anchor" orientation="portrait"/></fig><fig id="f31" position="anchor" fig-type="figure" orientation="portrait"><graphic xlink:href="bioling.19021-f31" position="anchor" orientation="portrait"/></fig>
	<p>Below is a summary chart for the accuracy of o3-mini-high in identifying unacceptable and acceptable sentences (<italic>we caveat this by highlighting the limited sample size and non-systematic assessment</italic>).</p><fig id="f32" position="anchor" fig-type="figure" orientation="portrait"><label>Figure 1</label><caption>
<title>Bar Chart Representing Classification Accuracy for o3-Mini-High for Unacceptable and Acceptable Sentences</title></caption><graphic xlink:href="bioling.19021-f32" position="anchor" orientation="portrait"/></fig>
<p>The model incorrectly stated that ‘John’ is ‘too far removed’ to bind with ‘himself’ in (12). With (14), the model incorrectly states that ‘The intended reading is that Saul is expected to win on his own’. (14) can be read as Jimmy expecting Saul to win some potential prize, whereby the prize could be, e.g., some painting of Saul, whereby ‘Saul won himself’ would similarly mean ‘Saul won a painting of himself’.</p>
<p>These arguments also apply to (15), which the model incorrectly identified as ungrammatical, even though Jimmy could, again, be expected to win some painting (or somesuch) of Saul (or, indeed, Saul himself could logically be the prize, e.g., ‘Saul’ could be the name of a pet or robot).</p>
<p>(16) has a dual reading, one under which ‘we’ and ‘each other’ are linked (ungrammatical) and one under which ‘they’ and ‘each other’ are linked (grammatical). The model failed to parse these possibilities.</p></sec>
<sec id="sec3.10"><title>3.10. Assessment of Graded Acceptability</title>
<p>Next, we followed up on the initial indications from Section 3.9 that the model succeeds in identifying ungrammatical sentences but struggles to reliably identify acceptable sentences as such. Instead of presenting only grammatical and ungrammatical sentences (as in Section 3.9), we exploited the gradient cline in acceptability in the constructions below (‘*’ = unacceptable; ‘?’ = partially acceptable for some) collated from some recent linguistics literature (<xref ref-type="bibr" rid="r1">Amiraz, 2022</xref>; <xref ref-type="bibr" rid="r37">Murphy, 2024a</xref>; <xref ref-type="bibr" rid="r59">Toosarvandani, 2014</xref>; <xref ref-type="bibr" rid="r61">Wu, 2025</xref>). Note that the ‘partial acceptability’ rating was motivated directly by prior literature, and was not arbitrarily stipulated by our group. 14 of these sentences were either unacceptable (1-4) or partially acceptable (5-14). We presented these sentences to o3-mini-high (without the below annotated ‘*’ or ‘?’) and asked it to sort them by acceptability. We provide an abridged prompt below, for reasons of space (the sentences were presented below this prompt text in random order, without numbering).</p>
<disp-quote>
<p><bold>Prompt 23</bold></p>
	<p><?disable-indent?>Please sort the sentences below into increasing levels of acceptability: from (1) (wholly unacceptable) to (2) (unacceptable) to (3) (partially acceptable) to (4) (acceptable).</p></disp-quote>
<p>The model identified 7 sentences as unacceptable, only 2 sentences as partially acceptable, and 15 sentences as acceptable, diverging from the acceptability profile provided above. Below, we have marked sentences incorrectly judged by the model with a red cross, and those correctly judged with a green tick. If the model assigned a partially acceptable sentence as (1) (‘wholly unacceptable’) and provided a reasonable explanation, we considered this to be correct and hence assigned it a green tick.</p>

	<fig id="f44" position="anchor" fig-type="figure" orientation="portrait"><graphic xlink:href="bioling.19021-f44" position="anchor" orientation="portrait"/></fig>
	
	<fig id="f33" position="anchor" fig-type="figure" orientation="portrait"><graphic xlink:href="bioling.19021-f33" position="anchor" orientation="portrait"/></fig>
	
	<fig id="f34" position="anchor" fig-type="figure" orientation="portrait"><graphic xlink:href="bioling.19021-f34" position="anchor" orientation="portrait"/></fig>
	
<p>Below is a summary chart for the accuracy of o3-mini-high across the different types of sentences it was tasked with rating (<italic>we again caveat this by highlighting the limited sample size in the present preliminary study</italic>).</p><fig id="f35" position="anchor" fig-type="figure" orientation="portrait"><label>Figure 2</label><caption>
<title>Bar Chart Representing Accuracy for o3-Mini-High Across the Three Types of Sentences Provided in the Prompts</title></caption><graphic xlink:href="bioling.19021-f35" position="anchor" orientation="portrait"/></fig>
<p>The explanation for (14) incorrectly states that this is unacceptable due to more common expectations of the presence of ‘to raise children’ (we note that the model’s own numbering system in its response text seems inconsistent and flawed, so we refer to numbered items in our ‘Gradient Cline in Acceptability’ list above). The model was unable to recognize zeugmatic conceptual coordination as motivating inclusion into either the partially acceptable or unacceptable groups (i.e., (7)). Some of the explanations for the unacceptable sentences – though correctly identified as such – do not provide a coherent explanation for their unacceptable nature. For example, ‘Not three students arrived’ is deemed unacceptable purely because ‘it is odd and ungrammatical’—which raises the question as to why!</p>
<p>Importantly, we wish to stress that we provided to the model four distinct options for acceptability, which were not utilized correctly for some of the partially acceptable sentences – <italic>even when the model explicitly noted in its response that these were in fact not wholly acceptable</italic>. For example, two of the sentences that the model placed in the ‘Acceptable’ group are noted as being ‘odd’ and ‘unexpected’ – ideal criteria to motivate their inclusion into the ‘Partially Acceptable’ group.</p>
<p>Overall, the model succeeded in identifying the most egregiously unacceptable sentences (in both this section and in Section 3.9), and most of the plainly acceptable sentences. However, some of its explanations were either lacking in specificity or were inconsistent with the model’s grouping of the sentences in question. In addition, the model struggled considerably with partially acceptable sentences, classifying only two sentences as partially acceptable out of ten—and one of these two sentences was incorrectly classified (two of the partially acceptable sentences were classified as unacceptable with reasonable explanations, and so we deemed these to be correct judgments). As such, only one sentence out of ten was correctly placed within the ‘partially acceptable’ group. Therefore, we conclude that the kind of acceptability spectrum that humans are acutely sensitive to is not reliably captured by o3.</p></sec>
<sec id="sec3.11"><title>3.11. Modified Jabberwocky</title>
<p>In order to test the potential interaction of lexical and configurational processes, we presented the model with the following prompt.</p>
<disp-quote>
<p><bold>Prompt 24</bold></p>
	<p><?disable-indent?>Can you generate for me three ‘Jabberwocky’-style sentences which have the following properties: First, instead of replacing all content words with pseudowords (the typical way to implement a Jabberwocky sentence), I want you to replace all function words with pseudowords. The second sentence must contain a syntactic violation that must be detectible for English speakers. None of the pseudowords must rhyme with any other pseudoword across the three sentences. Finally, the three sentences must together form a coherent event structure.</p></disp-quote><fig id="f36" position="anchor" fig-type="figure" orientation="portrait"><graphic xlink:href="bioling.19021-f36" position="anchor" orientation="portrait"/></fig>
<p>Breaking down the four requests:</p>
<list id="L1" list-type="order">
<list-item>
<p><bold>Success</bold>: All function words were accurately replaced with pseudowords.</p></list-item>
<list-item>
<p><bold>Failure</bold>: The two neighboring pseudowords that are claimed to create “a clear syntactic violation” are not readily parsed as [Determiner, Determiner], since it is not necessarily ungrammatical to have two co-occurring pseudowords. For example, the sentence could readily be parsed as ‘The explorer followed with the map to a hidden grove’; or ‘across the map’, ‘within the map’, ‘in the map’, ‘on the map’, etc. The prompt requested that the syntactic violation must be detectible by English speakers – the model could have injected a syntactic violation that was more obviously marked on the content words.</p></list-item>
<list-item>
<p><bold>Failure</bold>: The pseudowords ‘flim’ and ‘krim’ rhyme.</p></list-item>
<list-item>
<p><bold>Success</bold>: A coherent narrative structure was provided.</p></list-item>
</list>
<p>Overall, the model was able to generate a series of narratively connected sentences and switch out all function words with pseudowords—operations that rely purely on <italic>lexical statistics, not structure</italic>. It failed with instructions that demanded a level of higher-order syntactic and even phonological inferences. Interestingly, by its own internal logic under which ‘lupn puxit map’ was inferred as the ungrammatical phrase ‘a the map’, the model was correct. But it was seemingly unable to check against other alternative parsings that would render this string of words grammatical. The model made it seem as if placing two pseudowords in the “wrong order” constitutes a syntactic violation since the English words that the model substituted them for would be ungrammatical. But of course, no English speaker would know that these pseudowords were transformed from specific function words.</p></sec>
<sec id="sec3.12"><title>3.12. Syntactic Superposition</title>
<p>The next prompt required the model to represent multiple syntactic violations within a single sentence, but to do so in a manner that nevertheless yielded some interpretable output. Though this is admittedly a difficult challenge, our motivation here was to expose the type of reasoning o3 exhibited when encountered with this challenge of negotiating two distinct syntactic rules in the service of some semantically-related goal.</p>
<disp-quote>
<p><bold>Prompt 25</bold></p>
	<p><?disable-indent?>Generate a list of 10 sentences that exhibit the following property: They all violate two different types of grammatical rules, but violating these two rules simultaneously yields a semantically or syntactically acceptable sentence. Each of the 10 sentences must combine different rule violations.</p></disp-quote><fig id="f37" position="anchor" fig-type="figure" orientation="portrait"><graphic xlink:href="bioling.19021-f37" position="anchor" orientation="portrait"/></fig><fig id="f38" position="anchor" fig-type="figure" orientation="portrait"><graphic xlink:href="bioling.19021-f38" position="anchor" orientation="portrait"/></fig><fig id="f39" position="anchor" fig-type="figure" orientation="portrait"><graphic xlink:href="bioling.19021-f39" position="anchor" orientation="portrait"/></fig><fig id="f40" position="anchor" fig-type="figure" orientation="portrait"><graphic xlink:href="bioling.19021-f40" position="anchor" orientation="portrait"/></fig><fig id="f41" position="anchor" fig-type="figure" orientation="portrait"><graphic xlink:href="bioling.19021-f41" position="anchor" orientation="portrait"/></fig>
<p>The explanation for Sentences 2-5 and 7-10 can be used as a justification for basically all ungrammatical sentences as to why they are ungrammatical. This justification boils down to ‘speakers can just choose to ignore this word’ or ‘some people stutter sometimes’. This is perfectly true, but it is hardly in full compliance with the prompt’s request for a sentence that is “semantically or syntactically acceptable”. Meanwhile, sentences 1 and 6 rely on non-standard forms of English. As such, the model in effect failed to generate a single example of two syntactic rules ‘cancelling out’ (in semantic space or configurational space) to yield some interpretable structure. Perhaps most importantly, the prompt required the model to “combine different rule violations”, yet the general theme of ‘redundancy’, ‘repetition’ and ‘superfluous’ elements cited by the model in its explanations ensured one general violation type became overwhelmingly dominant (i.e., simply repeating a word).</p></sec>
<sec id="sec3.13"><title>3.13. Impossible Objects</title>
<p>Inspired by sentences involving complex forms of polysemy (e.g., “Lunch was delicious but took forever”; “The newspaper on the table was sued by a millionaire”; “The White House issued a statement before being repainted”) involving the combination of categorially distinct semantic types (<xref ref-type="bibr" rid="r13">Gotham, 2017</xref>; <xref ref-type="bibr" rid="r36">Murphy, 2021</xref>, <xref ref-type="bibr" rid="r37">2024a</xref>), we generated the following prompt.</p>
<disp-quote>
<p><bold>Prompt 26</bold></p>
	<p><?disable-indent?>Some sentences involving polysemous words can yield semantically 'impossible' objects, like nouns that are simultaneously referred to as processes or events or concrete tokens. Generate five sentences that each involve reference to a different type of semantically impossible entity, but which is perfectly comprehensible to English speakers as not violating any rules of semantic composition or conceptual combination. In these sentences, you must only refer to the named entity once explicitly. In addition, each sentence must exhibit a different combination of multiple meanings being combined together.</p></disp-quote><fig id="f42" position="anchor" fig-type="figure" orientation="portrait"><graphic xlink:href="bioling.19021-f42" position="anchor" orientation="portrait"/></fig><fig id="f43" position="anchor" fig-type="figure" orientation="portrait"><graphic xlink:href="bioling.19021-f43" position="anchor" orientation="portrait"/></fig>
<p>From these responses, it seems clear that the model has no human-like sense of semantic anomaly. The model is correct that (3) can be interpreted as a piece of information and also a physical text, but the other examples fail to generate any coherent sense of impossibility. For example, it is not ‘semantically impossible’ for something concrete to have an emotional impact. In (2), the model’s intended meaning, of tree bark ‘resounding’, is still not triggering of an impossible entity. With (5) (‘The plot was buried beneath layers of mystery’), the model uses ‘buried’ as metaphorical, such that an abstract plot exhibits some relation to some abstract mystery, hence causing no impossibility. With (1), (4) and (5), the model seems to assume that ‘figurative’ and ‘playful’ meanings suffice to satisfy the prompt’s request for blending semantically distinct meanings.</p>
<p>In addition, the prompt requested ‘a different combination’ across all sentences, but the ‘concrete/physical’ sense was used every time (sometimes twice with one sentence, as in (2)). This task would have been easily achievable if the model had blended <italic>physical</italic>, <italic>event</italic>, <italic>information</italic> and <italic>institution</italic> senses in various ways—instead, it was only able to mix vaguely metaphorical meanings. Notice that, as with some previous prompts above (e.g., Prompt 24), here we gave the model a generous clue as to how to solve this problem, and yet it was still unable to do so.</p></sec>
<sec id="sec3.14"><title>3.14. Summary</title>
<p>We ran these above prompts 3 additional times in o3 via OpenAI’s API, and the accuracy closely reproduced the above effects that we found via the chat interface. In Supplementary Material we provide the full results, broken down by accuracy.</p></sec></sec>
<sec id="sec4" sec-type="discussion"><title>4. Discussion</title>
	<p>As predicted by some previous position papers and experimental reports (<xref ref-type="bibr" rid="r2">Baggio &amp; Murphy, 2024</xref>; <xref ref-type="bibr" rid="r18">Leivada et al., 2023a</xref>, <xref ref-type="bibr" rid="r20">2023b</xref>; <xref ref-type="bibr" rid="r26">Marcus, 2024</xref>), the latest sophisticated reasoning model from OpenAI (o3) falls short of demonstrating human-like expertise in compositional syntax-semantics. It fails to cleanly dissociate conceptual content from structural configuration – a basic requirement of compositional syntax (<xref ref-type="bibr" rid="r11">Evans, 1985</xref>; <xref ref-type="bibr" rid="r32">McCarty et al., 2023</xref>; <xref ref-type="bibr" rid="r40">Murphy, 2025</xref>) – and it provides surreal meanings instead of truly ungrammatical sentences. It was unable to generate a Jabberwocky structure that accurately represented a clear syntactic violation, it was unable to accurately assess the output of applying two distinct syntactic violations to a sentence, and it was unable to represent semantically impossible entities. Our results indicate that the kind of sentence acceptability spectrum that humans are acutely sensitive to (<xref ref-type="bibr" rid="r56">Sprouse &amp; Almeida, 2012</xref>) is not reliably captured by o3. Although we only provide minimal descriptive statistics over a brief sample size (with a more systematic investigation forthcoming), our prompts covered a broad range of grammatical demands, and indicate not only that large language models (LLMs) (like ChatGPT-4o and Large Reasoning Models like o3) have problems with ‘contextual’ and ‘pragmatic’ reasoning, but that they have not yet grasped formal language competence (in contrast to more optimistic assessments in <xref ref-type="bibr" rid="r23">Mahowald et al., 2024</xref>).</p>
<sec id="sec4.1"><title>4.1. Structure or Statistics?</title>
	<p>While <xref ref-type="bibr" rid="r3">Beguš et al. (2025)</xref> report that GPT-4 is capable of recognizing ambiguities, correcting its own analytical errors, and commenting on the feasibility of multiple solutions, we found that the more recent o3 model fails to achieve something much more elementary: It was unable to reliably distinguish between meaning and structure. When <xref ref-type="bibr" rid="r3">Beguš et al. (2025)</xref> focus on OpenAI’s o1 model, they claim that its “ability to construct center-embedded sentences without being explicitly prompted to do so thus suggests that the model acquired grammatical structure beyond the simple distributional tendencies of its training data set”. In contrast, our results cast a more pessimistic light on the grammatical capacities of o3, including explicitly for center-embedding.</p>
	<p>Moreover, our results (see especially Sections 3.6-3.13) help emphasize an apparent lack of meta-linguistic understanding (contra <xref ref-type="bibr" rid="r3">Beguš et al., 2025</xref>). For LLMs, language simply <italic>is</italic> the system it is trying to master, whereas for humans language is exploited as a powerful cognitive and inferential tool. Meta-linguistic understanding is only possible in principle if there is some separate cognitive/generative model or grounding in a world model that language is used to revise/update (<xref ref-type="bibr" rid="r20">Leivada et al., 2023b</xref>; <xref ref-type="bibr" rid="r25">Marcus, 2022</xref>). This does not seem to be the case for o3.</p>
<p>One caveat we wish to highlight here is the possibility that the model’s failure with drawing tree representations may simply be due to issues with interfacing with the drawing module itself, and may not necessarily be driven by issues in syntactic representation. Future work could attempt to have o3 output distinct types of configurational representations, perhaps via formalized languages that may be more approximate to native features of the model. A related caveat is that we have no direct human performance scores to directly make claims about certain ‘human-level’ performance, which will be needed to make such comparisons.</p></sec>
<sec id="sec4.2"><title>4.2. Syntax or Salmon?</title>
<p>Our results support recent hypotheses concerning the ability of language models to represent ‘horizontal’ linguistic information, but their significantly reduced ability to represent ‘vertical’ types of hierarchical compositional syntax-semantics (<xref ref-type="bibr" rid="r38">Murphy, 2024b</xref>, <xref ref-type="bibr" rid="r39">2024c</xref>). Postulating a chain of uni-directional associations between elements (and only showing an ability to deal with mono-configurational assessments, rather than understanding the dynamic relationship between syntactic processes and variable semantic interpretations; i.e., Sections 3.7-3.10) does not entail grammatical understanding. The language system does not fly solo—it is always in the game of driving higher-order inferences, planning, consolidating experience, and aiding directed attention. As suggested by our results, o3-mini-high lacks an ability to handle syntactic inferences <italic>alongside</italic> cognitive model updating, given its clear inability to recognize the various ways in which semantic and syntactic representations dynamically interact. Numerous examples from our report illustrate this. For example, the semantically zeugmatic constructions ‘The salmon was fast and delicious’ and ‘My appointment was long and obnoxious’ were deemed felicitous. The model was likely heavily biased by the lexico-semantic statistics of these constructions rather than by the subtle ways in which the grammar regulates distinct coordinates in conceptual space that differ markedly from the same general meaning being configured in syntactically distinct ways (e.g., compare with ‘The salmon was fast and it was delicious’; <xref ref-type="bibr" rid="r36">Murphy, 2021</xref>).</p>
<p>Our results therefore indicate a strong bias for imposing ‘horizontal’ relations on the part of o3. Humans, in contrast, have a strong bias from an early age to impose hierarchical, compositional structure above and beyond linear relations (<xref ref-type="bibr" rid="r34">Murphy, 2020a</xref>; <xref ref-type="bibr" rid="r47">Perkins &amp; Lidz, 2021</xref>). As reviewed in <xref ref-type="bibr" rid="r39">Murphy (2024c)</xref>, LLMs seem able to capture certain features of dependencies (<xref ref-type="bibr" rid="r57">Tesnière, 1959</xref>), but other fundamental principles of language that regulate how constituency, headedness, and incremental node counts yield semantic instructions during parsing (via the <italic>mapping</italic> of syntactic objects to updates of cognitive models) remain somewhat elusive.</p></sec>
<sec id="sec4.3"><title>4.3. Reasoning or Rambling?</title>
	<p>Though it may represent an advance in “the boundaries of what small models can achieve, delivering exceptional STEM capabilities—with particular strength in science, math, and coding—all while maintaining the low cost and reduced latency of OpenAI o1-mini” (<xref ref-type="bibr" rid="r46">OpenAI, 2025</xref>), this most recent model nevertheless falls short in similar ways to previous models (<xref ref-type="bibr" rid="r9">Dentella et al., 2024</xref>; <xref ref-type="bibr" rid="r39">Murphy, 2024c</xref>). Our work expands on previous results exposing a stark absence of response stability in large language models (<xref ref-type="bibr" rid="r8">Dentella et al., 2023</xref>). Language models can assign probabilities to strings of words, but grammaticality cannot be construed as a phenomenon of transitional probability extracted from lexical items alone (<xref ref-type="bibr" rid="r21">Lenneberg, 1967</xref>). For this reason, recent advances that dispense with the notion of ‘tokenization’ altogether in favour of seeking ‘Large Concept Models’ grounded in semantic representations may potentially be more preferable in some cases (<xref ref-type="bibr" rid="r17">LCM Team et al., 2024</xref>).</p>
<p>Not only does the o3 model fall short in terms of providing a clear path towards artificial general intelligence (<xref ref-type="bibr" rid="r51">Pfister &amp; Jud, 2025</xref>), it also fails to demonstrate a robust grasp of some of the most fundamental elements of compositional linguistic structures. Our brief report provides further reasons for scepticism towards the claim from Microsoft that OpenAI’s recent models “[attain] a form of general intelligence” and show “sparks of artificial general intelligence” (<xref ref-type="bibr" rid="r5">Bubeck et al., 2023</xref>, p. 92). We find claims from the AI team at Apple more reasonable here: A recent assessment found no evidence of formal reasoning in language models, with the team concluding that their behavior is better explained by sophisticated pattern matching (<xref ref-type="bibr" rid="r28">Mirzadeh et al., 2024</xref>). Consulting some of the explanations for acceptability provided by o3 (e.g., Section 3.7-3.10) also reinforces the assessment that ChatGPT is a professional “bullshitter” (<xref ref-type="bibr" rid="r14">Hicks et al., 2024</xref>), “bloviator” and “a fluent spouter of bullshit” (<xref ref-type="bibr" rid="r27">Marcus &amp; Davis, 2020</xref>).</p>
<p>Interestingly, various advocates and proponents of LLMs have recently argued that linguists who claim that sentences such as ‘Dogs dogs dog dog dogs’ are grammatical are offering a psycholinguistically implausible and unhelpful theory of grammar. And yet, in a twist of irony, according to the present results the most advanced model from OpenAI does not appear to agree with this critique, and is seemingly so eager to attempt to parse these types of structures that it readily determines wholly <italic>ungrammatical</italic> cases (such as the “Glarts…” examples in Prompts 8-9) to be grammatical.</p>
<p>In some of our prompts requesting the generation of ungrammatical structures (Section 3.6) or the assessment of complex embedding (Section 3.4), we suspect that o3 was doubtless influenced by lexical statistics to a much greater extent than by any level of hidden states used to support (some format of) grammatical configuration (a bias already documented for text-to-image models; <xref ref-type="bibr" rid="r20">Leivada et al., 2023b</xref>). Yet, the task at hand was explicitly to invoke higher-order hierarchical representations and attempt to de-noise the relevant assessments from any influence from lexico-semantic statistics.</p></sec>
<sec id="sec4.4"><title>4.4. Theories or Tools?</title>
<disp-quote>
<p>“<italic>The best material model for a cat is another, or preferably the same cat</italic>”.</p>
<p>– <xref ref-type="bibr" rid="r53">Rosenblueth and Wiener (1945)</xref></p></disp-quote>
	<p>The fact that o3 was unable to reliably generate basic violations of syntactic rules should motivate some degree of concern and scepticism towards claims that LLMs do better than linguists on every job that syntactic theory was intended to perform. <xref ref-type="bibr" rid="r64">Ambridge and Blything (2024)</xref> argue that “large language models are better than theoretical linguists at theoretical linguistics”—an assessment at odds with our discovery that the most sophisticated reasoning model from OpenAI deems a number of grammatical sentences to constitute violations of binding theory, amongst other things. As pointed out by others, it is also incoherent to claim that LLMs can directly constitute a “theory of language” (<xref ref-type="bibr" rid="r16">Katzir, 2023</xref>; <xref ref-type="bibr" rid="r31">Müller, 2024</xref>). This type of theory-nihilism (and data-ism) has been bolstered by the recent surge of interest in LLMs, but it has yet to be proven capable of being translated into a concrete scientific research program that can replace dominant theories of language acquisition and processing.</p>
<p>Although <xref ref-type="bibr" rid="r48">Piantadosi (2024)</xref> recently attempted to do to Chomsky what Chomsky did to Skinner in 1959 (i.e., refute his research enterprise and much of its philosophical basis), Piantadosi’s arguments proved to be flawed (<xref ref-type="bibr" rid="r16">Katzir, 2023</xref>)<xref ref-type="fn" rid="fn1"><sup>1</sup></xref><fn id="fn1"><label>1</label>
<p>See also a follow-up debate on this topic: “A conversation on large language models: Murphy &amp; Piantadosi”. ActInf GuestStream 041.1 (23 April 2023). <ext-link ext-link-type="uri" xlink:href="https://youtube.com/watch?v=EEyVd9d3D5U">https://youtube.com/watch?v=EEyVd9d3D5U</ext-link> </p></fn>. As pointed out already by <xref ref-type="bibr" rid="r7">Collins (2024)</xref>:</p>
<disp-quote>
<p>“The fundamental reason that LLMs cannot be scientific theories is <italic>not</italic> because they are probabilistic, or because they involve parameter tuning. Nor even does it have to do with their lack of human intelligibility. As Piantadosi notes, such things are common enough among mature sciences. Rather, the issue is that the representational capacities of LLMs (and their connectionist siblings) are <italic>unbounded</italic> in a way that makes their representations arbitrary”.</p></disp-quote>
<p>As a brief aside, it is worth highlighting in this context that it was the human brain during evolution that <italic>created</italic> syntactic structure (<xref ref-type="bibr" rid="r33">Murphy, 2019</xref>, <xref ref-type="bibr" rid="r35">2020b</xref>, <xref ref-type="bibr" rid="r39">2024c</xref>; <xref ref-type="bibr" rid="r45">Murphy et al., 2022</xref>, <xref ref-type="bibr" rid="r42">2023</xref>, <xref ref-type="bibr" rid="r44">2024b</xref>). LLMs, by contrast, being universal function approximators (<xref ref-type="bibr" rid="r62">Yun et al., 2019</xref>), are surely able to reproduce certain aspects of lexico-semantic statistics from the ‘fossilized’ remains of the human generative machine they recover from data (<xref ref-type="bibr" rid="r29">Mitchell &amp; Krakauer, 2023</xref>). But there are very plausible reasons to assume that whatever method LLMs use, it bears little resemblance to the algorithms deployed by human infants (<xref ref-type="bibr" rid="r19">Leivada &amp; Murphy, 2022</xref>; <xref ref-type="bibr" rid="r41">Murphy et al., 2025</xref>), who deploy specialized knowledge rather than solely invoking general token-prediction algorithms. Due to LLMs being a universal approximation method, they are more akin to tools such as generalized Fourier series than scientific theories of human cognition. Relatedly, distributional semantics vectors can certainly be used as a <italic>proxy</italic> for natural language meanings, but they are not to be confused with “the stuff of thought” itself (<xref ref-type="bibr" rid="r50">Pinker, 2007</xref>). This is not even to mention related concerns that hover in the background, like the fact that the back propagation training algorithms used with LLMs are considerably different from human learning mechanisms (<xref ref-type="bibr" rid="r12">Evanson et al., 2023</xref>).</p></sec>
<sec id="sec4.5"><title>4.5. Design or Data?</title>
<p>Instead of scaling to unprecedented levels of compute via architectures that are fundamentally grounded in token prediction, a return to more traditional design features of the human mind (predicate-argument structure, variable binding, constituent structure, minimal compositional binding; <xref ref-type="bibr" rid="r10">Donatelli &amp; Koller, 2023</xref>) may be needed to orchestrate a more reliable expertise in human language (<xref ref-type="bibr" rid="r52">Ramchand, 2024</xref>). This could be implemented by forms of neuro-symbolic approaches.</p>
<p>Still, it is also certainly true that mainstream theoretical linguistics (e.g., the minimalist enterprise) was in some ways ill-equipped to successfully predict which patterns of linguistic activity might be (un)approachable by LLMs. To illustrate, a potential weakness in this direction with respect to recent generative grammar theorizing has been the underestimation of the extent to which lexical information drives composition. This type of information may permit LLMs to abductively infer certain elements of grammatical rules, in whatever format this ultimately takes (<xref ref-type="bibr" rid="r52">Ramchand, 2024</xref>). Future research should more carefully apply the tools of linguistics to isolate specific sub-components of syntax that might be in principle achievable by language models, given specific design features. For instance, with LLMs “complete recovery of syntax might be very difficult computationally” (<xref ref-type="bibr" rid="r24">Marcolli et al., 2025</xref>, p. 13), even if we assume that attention modules can in principle “satisfy the same algebraic structure” as what Marcolli et al. postulate as being necessary for syntax-semantics interface mappings.</p>
<p>More broadly, the currently popular vector approach (<xref ref-type="bibr" rid="r49">Piantadosi et al., 2024</xref>) risks conflating the <italic>implementation medium</italic> with the <italic>computational level</italic>: for a high-dimensional vector space that supposedly encodes structured symbolic derivations, unless the structure is explicitly recoverable and manipulable then the representation is only functionally equivalent in a loose sense. How does a vector-based learner discover abstract, exceptionless rules without relying on statistical accident? <xref ref-type="bibr" rid="r49">Piantadosi et al. (2024)</xref>, and others, risk explaining compositionality post hoc (“it emerges in the geometry”) rather than as a necessary design property. And it is these necessary algebraic properties of language that linguistic theory tries to capture.</p></sec></sec>
<sec id="sec5" sec-type="conclusions"><title>5. Conclusion</title>
<p>In contrast to some recent claims that we may be living through “the end of (generative) linguistics as we know it” (<xref ref-type="bibr" rid="r6">Chesi, Forthcoming</xref>), our results should spur cognitive scientists, psychologists and philosophers to press even further into the reaches of algorithmic and psycholinguistic models of hierarchical syntactic composition. Some recent directions here come from exploiting concepts from statistical physics (<xref ref-type="bibr" rid="r43">Murphy et al., 2024a</xref>) to uncover previously unknown principles of language design (and to provide a potential meta-language to compare and quantify distinct syntactic theories), and from recent attempts to bridge symbolic theories of language with probabilistic-connectionist models of parsing (<xref ref-type="bibr" rid="r39">Murphy, 2024c</xref>) to offer a neurobiologically plausible infrastructure for syntactic inferences.</p>
<p>The goal here should not be to virtuously resist the era of big data from the safety of our theoretical models of syntax, but to learn how best to properly leverage computational methods – not in order to <italic>surrender</italic> to LLMs (<xref ref-type="bibr" rid="r48">Piantadosi, 2024</xref>) but to <italic>utilize</italic> them (<xref ref-type="bibr" rid="r60">van Rooij et al., 2024</xref>) to assess how statistical and symbolic representations interact during the acquisition and processing of language.</p>

</sec>
</body>
<back>
	
	<sec sec-type="ethics-statement">
		<title>Ethics Statement</title>
		<p>This work did not involve human subject data, and no use of generative AI (outside of the main experiments which directly probed ChatGPT) was involved in the preparation of this manuscript.</p>
	</sec>
	
<ref-list><title>References</title>
	
	<ref id="r64"><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name name-style="western"><surname>Ambridge</surname>, <given-names>B.</given-names></string-name>, &amp; <string-name name-style="western"><surname>Blything</surname>, <given-names>L.</given-names></string-name></person-group> (<year>2024</year>). <article-title>Large language models are better than theoretical linguists at theoretical linguistics.</article-title> <source>Theoretical Linguistics</source>, <volume>50</volume>(<issue>1-2</issue>), <fpage>33</fpage>–<lpage>48</lpage>. <pub-id pub-id-type="doi">10.1515/tl-2024-2002</pub-id></mixed-citation></ref>
	
<ref id="r1"><mixed-citation publication-type="confproc">Amiraz, O. (2022). Not all scalar inferences are alike: the effect of existential presuppositions. In Degano, M., Roberts, T., Sbardolini, G., &amp; Schouwstra, M. (Eds.). <italic>Proceedings of the 2022 Amsterdam Colloquium,</italic> 23, 8–14.</mixed-citation></ref>
<ref id="r2"><mixed-citation publication-type="preprint"><person-group person-group-type="author"><string-name name-style="western"><surname>Baggio</surname>, <given-names>G.</given-names></string-name>, &amp; <string-name name-style="western"><surname>Murphy</surname>, <given-names>E.</given-names></string-name></person-group> (<year>2024</year>). <article-title>On the referential capacity of large language models.</article-title> </mixed-citation></ref>
<ref id="r3"><mixed-citation publication-type="other">Beguš, G., Dąbkowski, M., &amp; Rhodes, R. (2025). <italic>Large linguistic models: Analyzing theoretical linguistic abilities of LLMs</italic>. Lingbuzz. <ext-link ext-link-type="uri" xlink:href="https://lingbuzz.net/007269">https://lingbuzz.net/007269</ext-link></mixed-citation></ref>
<ref id="r4"><mixed-citation publication-type="preprint"><person-group person-group-type="author"><string-name name-style="western"><surname>Besta</surname>, <given-names>M.</given-names></string-name>, <string-name name-style="western"><surname>Barth</surname>, <given-names>J.</given-names></string-name>, <string-name name-style="western"><surname>Schreiber</surname>, <given-names>E.</given-names></string-name>, <string-name name-style="western"><surname>Kubicek</surname>, <given-names>A.</given-names></string-name>, <string-name name-style="western"><surname>Catarino</surname>, <given-names>A.</given-names></string-name>, <string-name name-style="western"><surname>Gerstenberger</surname>, <given-names>R.</given-names></string-name>, <string-name name-style="western"><surname>Nyczyk</surname>, <given-names>P.</given-names></string-name>, <string-name name-style="western"><surname>Iff</surname>, <given-names>P.</given-names></string-name>, <string-name name-style="western"><surname>Li</surname>, <given-names>Y.</given-names></string-name>, <string-name name-style="western"><surname>Houliston</surname>, <given-names>S.</given-names></string-name>, <string-name name-style="western"><surname>Sternal</surname>, <given-names>T.</given-names></string-name>, <string-name name-style="western"><surname>Copik</surname>, <given-names>M.</given-names></string-name>, <string-name name-style="western"><surname>Kwaśniewski</surname>, <given-names>G.</given-names></string-name>, <string-name name-style="western"><surname>Müller</surname>, <given-names>J.</given-names></string-name>, <string-name name-style="western"><surname>Flis</surname>, <given-names>L.</given-names></string-name>, <string-name name-style="western"><surname>Erberhard</surname>, <given-names>H.</given-names></string-name>, <string-name name-style="western"><surname>Chen</surname>, <given-names>U.</given-names></string-name>, <string-name name-style="western"><surname>Niewiadomski</surname>, 
<given-names>H.</given-names></string-name>, &amp; <string-name name-style="western"><surname>Hoefler</surname>, <given-names>T.</given-names></string-name></person-group> (<year>2025</year>). <article-title>Reasoning language models: A blueprint.</article-title> </mixed-citation></ref>
<ref id="r5"><mixed-citation publication-type="preprint"><person-group person-group-type="author"><string-name name-style="western"><surname>Bubeck</surname>, <given-names>S.</given-names></string-name>, <string-name name-style="western"><surname>Chandrasekaran</surname>, <given-names>V.</given-names></string-name>, <string-name name-style="western"><surname>Eldan</surname>, <given-names>R.</given-names></string-name>, <string-name name-style="western"><surname>Gehrke</surname>, <given-names>J.</given-names></string-name>, <string-name name-style="western"><surname>Horvitz</surname>, <given-names>E.</given-names></string-name>, <string-name name-style="western"><surname>Kamar</surname>, <given-names>E.</given-names></string-name>, <string-name name-style="western"><surname>Lee</surname>, <given-names>P.</given-names></string-name>, <string-name name-style="western"><surname>Lee</surname>, <given-names>Y. T.</given-names></string-name>, <string-name name-style="western"><surname>Li</surname>, <given-names>Y.</given-names></string-name>, <string-name name-style="western"><surname>Lundberg</surname>, <given-names>S.</given-names></string-name>, <string-name name-style="western"><surname>Nori</surname>, <given-names>H.</given-names></string-name>, <string-name name-style="western"><surname>Palangi</surname>, <given-names>H.</given-names></string-name>, <string-name name-style="western"><surname>Ribeiro</surname>, <given-names>M. T.</given-names></string-name>, &amp; <string-name name-style="western"><surname>Zhang</surname>, <given-names>Y.</given-names></string-name></person-group> (<year>2023</year>). <article-title>Sparks of artificial general intelligence: Early experiments with GPT-4.</article-title> </mixed-citation></ref>
<ref id="r6"><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name name-style="western"><surname>Chesi</surname>, <given-names>C.</given-names></string-name></person-group> (<comment>Forthcoming</comment>). <article-title>Is it the end of (generative) linguistics as we know it?</article-title> <source>Italian Journal of Linguistics</source>.</mixed-citation></ref>
<ref id="r7"><mixed-citation publication-type="web">Collins, J. (2024). <italic>The simple reason LLMs are not scientific models (and what the alternative is for linguistics).</italic> Lingbuzz. <ext-link ext-link-type="uri" xlink:href="https://lingbuzz.net/008026">https://lingbuzz.net/008026</ext-link></mixed-citation></ref>
<ref id="r8"><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name name-style="western"><surname>Dentella</surname>, <given-names>V.</given-names></string-name>, <string-name name-style="western"><surname>Günther</surname>, <given-names>F.</given-names></string-name>, &amp; <string-name name-style="western"><surname>Leivada</surname>, <given-names>E.</given-names></string-name></person-group> (<year>2023</year>). <article-title>Systematic testing of three Language Models reveals low language accuracy, absence of response stability, and a yes-response bias.</article-title> <source>Proceedings of the National Academy of Sciences of the United States of America</source>, <volume>120</volume>(<issue>51</issue>), <elocation-id>e2309583120</elocation-id>. <pub-id pub-id-type="doi">10.1073/pnas.2309583120</pub-id><pub-id pub-id-type="pmid">38091290</pub-id></mixed-citation></ref>
<ref id="r9"><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name name-style="western"><surname>Dentella</surname>, <given-names>V.</given-names></string-name>, <string-name name-style="western"><surname>Günther</surname>, <given-names>F.</given-names></string-name>, <string-name name-style="western"><surname>Murphy</surname>, <given-names>E.</given-names></string-name>, <string-name name-style="western"><surname>Marcus</surname>, <given-names>G.</given-names></string-name>, &amp; <string-name name-style="western"><surname>Leivada</surname>, <given-names>E.</given-names></string-name></person-group> (<year>2024</year>). <article-title>Testing AI on language comprehension tasks reveals insensitivity to underlying meaning.</article-title> <source>Scientific Reports</source>, <volume>14</volume>, <elocation-id>28083</elocation-id>. <pub-id pub-id-type="doi">10.1038/s41598-024-79531-8</pub-id><pub-id pub-id-type="pmid">39543236</pub-id></mixed-citation></ref>
<ref id="r10"><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name name-style="western"><surname>Donatelli</surname>, <given-names>L.</given-names></string-name>, &amp; <string-name name-style="western"><surname>Koller</surname>, <given-names>A.</given-names></string-name></person-group> (<year>2023</year>). <article-title>Compositionality in computational linguistics.</article-title> <source>Annual Review of Linguistics</source>, <volume>9</volume>, <fpage>463</fpage>–<lpage>481</lpage>. <pub-id pub-id-type="doi">10.1146/annurev-linguistics-030521-044439</pub-id></mixed-citation></ref>
<ref id="r11"><mixed-citation publication-type="book">Evans, G. (1985). Semantic theory and tacit knowledge. In <italic>Collected Papers</italic> (pp. 322–342). Oxford University Press.</mixed-citation></ref>
<ref id="r12"><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name name-style="western"><surname>Evanson</surname>, <given-names>L.</given-names></string-name>, <string-name name-style="western"><surname>Lakretz</surname>, <given-names>Y.</given-names></string-name>, &amp; <string-name name-style="western"><surname>King</surname>, <given-names>J. R.</given-names></string-name></person-group> (<year>2023</year>). <article-title>Language acquisition: Do children and language models follow similar learning stages?</article-title> <source>Findings of the Association for Computational Linguistics: ACL</source>, <volume>2023</volume>, <fpage>12205</fpage>–<lpage>12218</lpage>. <pub-id pub-id-type="doi">10.18653/v1/2023.findings-acl.773</pub-id></mixed-citation></ref>
<ref id="r13"><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name name-style="western"><surname>Gotham</surname>, <given-names>M.</given-names></string-name></person-group> (<year>2017</year>). <article-title>Composing criteria of individuation in copredication.</article-title> <source>Journal of Semantics</source>, <volume>34</volume>(<issue>2</issue>), <fpage>333</fpage>–<lpage>371</lpage>.</mixed-citation></ref>
<ref id="r14"><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name name-style="western"><surname>Hicks</surname>, <given-names>M. T.</given-names></string-name>, <string-name name-style="western"><surname>Humphries</surname>, <given-names>J.</given-names></string-name>, &amp; <string-name name-style="western"><surname>Slater</surname>, <given-names>J.</given-names></string-name></person-group> (<year>2024</year>). <article-title>ChatGPT is bullshit.</article-title> <source>Ethics and Information Technology</source>, <volume>26</volume>(<issue>2</issue>), <fpage>1</fpage>–<lpage>10</lpage>. <pub-id pub-id-type="doi">10.1007/s10676-024-09775-5</pub-id></mixed-citation></ref>
<ref id="r15"><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name name-style="western"><surname>Kamath</surname>, <given-names>G.</given-names></string-name>, <string-name name-style="western"><surname>Schuster</surname>, <given-names>S.</given-names></string-name>, <string-name name-style="western"><surname>Vajjala</surname>, <given-names>S.</given-names></string-name>, &amp; <string-name name-style="western"><surname>Reddy</surname>, <given-names>S.</given-names></string-name></person-group> (<year>2024</year>). <article-title>Scope ambiguities in large language models.</article-title> <source>Transactions of the Association for Computational Linguistics</source>, <volume>12</volume>, <fpage>738</fpage>–<lpage>754</lpage>. <pub-id pub-id-type="doi">10.1162/tacl_a_00670</pub-id></mixed-citation></ref>
<ref id="r16"><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name name-style="western"><surname>Katzir</surname>, <given-names>R.</given-names></string-name></person-group> (<year>2023</year>). <article-title>Why large language models are poor theories of human linguistic cognition: A reply to Piantadosi.</article-title> <source>Biolinguistics</source>, <volume>17</volume>, <elocation-id>e13153</elocation-id>. <pub-id pub-id-type="doi">10.5964/bioling.13153</pub-id></mixed-citation></ref>
<ref id="r17"><mixed-citation publication-type="other">LCM Team, Barrault, L., Duquenne, P., Elbayad, M., Kozhevnikov, A., Alastruey, B., Andrews, P., Coria, M., Couairon, G., Costa-jussà, M. R., Dale, D., Elsahar, H., Heffernan, K., Janeiro, J. M., Tran, T., Ropers, C., Sánchez, E., San Roman, R., Mourachko, A., … &amp; Schwenk, H. (2024). <italic>Large Concept Models: Language modeling in a sentence representation space</italic>. Meta AI.</mixed-citation></ref>
<ref id="r18"><mixed-citation publication-type="preprint"><person-group person-group-type="author"><string-name name-style="western"><surname>Leivada</surname>, <given-names>E.</given-names></string-name>, <string-name name-style="western"><surname>Dentella</surname>, <given-names>V.</given-names></string-name>, &amp; <string-name name-style="western"><surname>Murphy</surname>, <given-names>E.</given-names></string-name></person-group> (<year>2023</year><comment>a</comment>). <article-title>The quo vadis of the relationship between language and large language models.</article-title> </mixed-citation></ref>
<ref id="r19"><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name name-style="western"><surname>Leivada</surname>, <given-names>E.</given-names></string-name>, &amp; <string-name name-style="western"><surname>Murphy</surname>, <given-names>E.</given-names></string-name></person-group> (<year>2022</year>). <article-title>A demonstration of the uncomputability of parametric models of language acquisition and a biologically plausible alternative.</article-title> <source>Language Development Research</source>, <volume>2</volume>(<issue>1</issue>), <fpage>105</fpage>–<lpage>138</lpage>.</mixed-citation></ref>
<ref id="r20"><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name name-style="western"><surname>Leivada</surname>, <given-names>E.</given-names></string-name>, <string-name name-style="western"><surname>Murphy</surname>, <given-names>E.</given-names></string-name>, &amp; <string-name name-style="western"><surname>Marcus</surname>, <given-names>G.</given-names></string-name></person-group> (<year>2023</year><comment>b</comment>). <article-title>DALL·E 2 fails to reliably capture common syntactic processes.</article-title> <source>Social Sciences &amp; Humanities Open</source>, <volume>8</volume>(<issue>1</issue>), <elocation-id>100648</elocation-id>. <pub-id pub-id-type="doi">10.1016/j.ssaho.2023.100648</pub-id></mixed-citation></ref>
<ref id="r21"><mixed-citation publication-type="book">Lenneberg, E. H. (1967). <italic>Biological foundations of language</italic>. John Wiley &amp; Sons.</mixed-citation></ref>
<ref id="r22"><mixed-citation publication-type="thesis">Lindström, A. D. (2024). <italic>Learning, reasoning, and compositional generalisation in Multimodal Language Models</italic> [PhD thesis]. Umeå University.</mixed-citation></ref>
<ref id="r23"><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name name-style="western"><surname>Mahowald</surname>, <given-names>K.</given-names></string-name>, <string-name name-style="western"><surname>Ivanova</surname>, <given-names>A.</given-names></string-name>, <string-name name-style="western"><surname>Blank</surname>, <given-names>I. A.</given-names></string-name>, <string-name name-style="western"><surname>Kanwisher</surname>, <given-names>N.</given-names></string-name>, <string-name name-style="western"><surname>Tenenbaum</surname>, <given-names>J. B.</given-names></string-name>, &amp; <string-name name-style="western"><surname>Fedorenko</surname>, <given-names>E.</given-names></string-name></person-group> (<year>2024</year>). <article-title>Dissociating language and thought in Large Language Models: A cognitive perspective.</article-title> <source>Trends in Cognitive Sciences</source>, <volume>28</volume>(<issue>6</issue>), <fpage>517</fpage>–<lpage>540</lpage>. <pub-id pub-id-type="doi">10.1016/j.tics.2024.01.011</pub-id><pub-id pub-id-type="pmid">38508911</pub-id></mixed-citation></ref>
<ref id="r24"><mixed-citation publication-type="book">Marcolli, M., Chomsky, N., &amp; Berwick, R. C. (2025). <italic>Mathematical structure of syntactic merge: An algebraic model for generative linguistics</italic>. MIT Press.</mixed-citation></ref>
<ref id="r25"><mixed-citation publication-type="other">Marcus, G. (2022, March 10). <italic>Deep learning is hitting a wall</italic>. Nautilus.</mixed-citation></ref>
<ref id="r26"><mixed-citation publication-type="book">Marcus, G. (2024). <italic>Taming Silicon Valley: How we can ensure that AI works for us.</italic> MIT Press.</mixed-citation></ref>
<ref id="r27"><mixed-citation publication-type="other">Marcus, G., &amp; Davis, E. (2020, August 22). <italic>GPT-3, bloviator: OpenAI’s language generator has no idea what it’s talking about</italic>. MIT Technology Review.</mixed-citation></ref>
<ref id="r28"><mixed-citation publication-type="preprint"><person-group person-group-type="author"><string-name name-style="western"><surname>Mirzadeh</surname>, <given-names>I.</given-names></string-name>, <string-name name-style="western"><surname>Alizadeh</surname>, <given-names>K.</given-names></string-name>, <string-name name-style="western"><surname>Shahrokhi</surname>, <given-names>H.</given-names></string-name>, <string-name name-style="western"><surname>Tuzel</surname>, <given-names>O.</given-names></string-name>, <string-name name-style="western"><surname>Bengio</surname>, <given-names>S.</given-names></string-name>, &amp; <string-name name-style="western"><surname>Farajtabar</surname>, <given-names>M.</given-names></string-name></person-group> (<year>2024</year>). <article-title>GSM-Symbolic: Understanding the limitations of mathematical reasoning in large language models.</article-title> </mixed-citation></ref>
<ref id="r29"><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name name-style="western"><surname>Mitchell</surname>, <given-names>M.</given-names></string-name>, &amp; <string-name name-style="western"><surname>Krakauer</surname>, <given-names>D.</given-names></string-name></person-group> (<year>2023</year>). <article-title>The debate over understanding in AI’s large language models.</article-title> <source>Proceedings of the National Academy of Sciences of the United States of America</source>, <volume>120</volume>(<issue>13</issue>), <elocation-id>e2215907120</elocation-id>. <pub-id pub-id-type="doi">10.1073/pnas.2215907120</pub-id><pub-id pub-id-type="pmid">36943882</pub-id></mixed-citation></ref>
<ref id="r30"><mixed-citation publication-type="preprint"><person-group person-group-type="author"><string-name name-style="western"><surname>Mollica</surname>, <given-names>F.</given-names></string-name>, &amp; <string-name name-style="western"><surname>Piantadosi</surname>, <given-names>S.</given-names></string-name></person-group> (<year>2022</year>). <article-title>Meaning without reference in large language models.</article-title> </mixed-citation></ref>
<ref id="r31"><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name name-style="western"><surname>Müller</surname>, <given-names>S.</given-names></string-name></person-group> (<year>2024</year>). <article-title>Large language models: The best linguistic theory, a wrong linguistic theory, or no linguistic theory at all.</article-title> <source>Zeitschrift für Sprachwissenschaft</source>, <volume>44</volume>(<issue>1</issue>). <pub-id pub-id-type="doi">10.18148/zs/2025-2001</pub-id></mixed-citation></ref>
<ref id="r32"><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name name-style="western"><surname>McCarty</surname>, <given-names>M. J.</given-names></string-name>, <string-name name-style="western"><surname>Murphy</surname>, <given-names>E.</given-names></string-name>, <string-name name-style="western"><surname>Scherschligt</surname>, <given-names>X.</given-names></string-name>, <string-name name-style="western"><surname>Woolnough</surname>, <given-names>O.</given-names></string-name>, <string-name name-style="western"><surname>Morse</surname>, <given-names>C. W.</given-names></string-name>, <string-name name-style="western"><surname>Snyder</surname>, <given-names>K.</given-names></string-name>, <string-name name-style="western"><surname>Mahon</surname>, <given-names>B. Z.</given-names></string-name>, &amp; <string-name name-style="western"><surname>Tandon</surname>, <given-names>N.</given-names></string-name></person-group> (<year>2023</year>). <article-title>Intraoperative cortical localization of music and language reveals signatures of structural complexity in posterior temporal cortex.</article-title> <source>iScience</source>, <volume>26</volume>, <elocation-id>107223</elocation-id>. <pub-id pub-id-type="doi">10.1016/j.isci.2023.107223</pub-id><pub-id pub-id-type="pmid">37485361</pub-id></mixed-citation></ref>
<ref id="r33"><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name name-style="western"><surname>Murphy</surname>, <given-names>E.</given-names></string-name></person-group> (<year>2019</year>). <article-title>No country for Oldowan men: Emerging factors in language evolution.</article-title> <source>Frontiers in Psychology</source>, <volume>10</volume>, <elocation-id>1448</elocation-id>. <pub-id pub-id-type="doi">10.3389/fpsyg.2019.01448</pub-id><pub-id pub-id-type="pmid">31275219</pub-id></mixed-citation></ref>
<ref id="r34"><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name name-style="western"><surname>Murphy</surname>, <given-names>E.</given-names></string-name></person-group> (<year>2020</year><comment>a</comment>). <article-title>Language design and communicative competence: the minimalist perspective.</article-title> <source>Glossa: A Journal of General Linguistics</source><italic>,</italic> <volume>5</volume>(<issue>1</issue>), <elocation-id>2</elocation-id>. <pub-id pub-id-type="doi">10.5334/gjgl.1081</pub-id></mixed-citation></ref>
<ref id="r35"><mixed-citation publication-type="book">Murphy, E. (2020b). <italic>The oscillatory nature of language</italic>. Cambridge University Press.</mixed-citation></ref>
<ref id="r36"><mixed-citation publication-type="thesis">Murphy, E. (2021). <italic>Linguistic representation and processing of copredication</italic> [PhD thesis]. University College London.</mixed-citation></ref>
<ref id="r37"><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name name-style="western"><surname>Murphy</surname>, <given-names>E.</given-names></string-name></person-group> (<year>2024</year><comment>a</comment>). <article-title>Predicate order and coherence in copredication.</article-title> <source>Inquiry</source>, <volume>67</volume>(<issue>6</issue>), <fpage>1744</fpage>–<lpage>1780</lpage>. <pub-id pub-id-type="doi">10.1080/0020174X.2021.1958054</pub-id></mixed-citation></ref>
<ref id="r38"><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name name-style="western"><surname>Murphy</surname>, <given-names>E.</given-names></string-name></person-group> (<year>2024</year><comment>b</comment>). <article-title>ROSE: A neurocomputational architecture for syntax.</article-title> <source>Journal of Neurolinguistics</source>, <volume>70</volume>, <elocation-id>101180</elocation-id>. <pub-id pub-id-type="doi">10.1016/j.jneuroling.2023.101180</pub-id></mixed-citation></ref>
<ref id="r39"><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name name-style="western"><surname>Murphy</surname>, <given-names>E.</given-names></string-name></person-group> (<year>2024</year><comment>c</comment>). <article-title>ROSE: A universal neural grammar.</article-title> <source>Cognitive Neuroscience</source>. <comment>Advance online publication</comment>. <pub-id pub-id-type="doi">10.1080/17588928.2025.2523875</pub-id><pub-id pub-id-type="pmid">40653898</pub-id></mixed-citation></ref>
<ref id="r40"><mixed-citation publication-type="book">Murphy, E. (2025). The nature of language and the structure of reality. In Benítez-Burraco, A., López, I. F., Fernández-Pérez, M., &amp; Ivanova, O. (Eds.), <italic>Biolinguistics at the cutting edge: Promises, achievements, and challenges</italic> (pp. 207–236). De Gruyter Mouton.</mixed-citation></ref>
<ref id="r41"><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name name-style="western"><surname>Murphy</surname>, <given-names>E.</given-names></string-name>, <string-name name-style="western"><surname>de Villiers</surname>, <given-names>J.</given-names></string-name>, &amp; <string-name name-style="western"><surname>Morales</surname>, <given-names>S. L.</given-names></string-name></person-group> (<year>2025</year>). <article-title>A comparative investigation into compositional syntax and semantics in DALL⋅E and young children.</article-title> <source>Social Sciences &amp; Humanities Open</source>, <volume>11</volume>, <elocation-id>101332</elocation-id>. <pub-id pub-id-type="doi">10.1016/j.ssaho.2025.101332</pub-id></mixed-citation></ref>
<ref id="r42"><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name name-style="western"><surname>Murphy</surname>, <given-names>E.</given-names></string-name>, <string-name name-style="western"><surname>Forseth</surname>, <given-names>K. J.</given-names></string-name>, <string-name name-style="western"><surname>Donos</surname>, <given-names>C.</given-names></string-name>, <string-name name-style="western"><surname>Snyder</surname>, <given-names>K. M.</given-names></string-name>, <string-name name-style="western"><surname>Rollo</surname>, <given-names>P. S.</given-names></string-name>, &amp; <string-name name-style="western"><surname>Tandon</surname>, <given-names>N.</given-names></string-name></person-group> (<year>2023</year>). <article-title>The spatiotemporal dynamics of semantic integration in the human brain.</article-title> <source>Nature Communications</source>, <volume>14</volume>, <elocation-id>6336</elocation-id>. <pub-id pub-id-type="doi">10.1038/s41467-023-42087-8</pub-id><pub-id pub-id-type="pmid">37875526</pub-id></mixed-citation></ref>
<ref id="r43"><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name name-style="western"><surname>Murphy</surname>, <given-names>E.</given-names></string-name>, <string-name name-style="western"><surname>Holmes</surname>, <given-names>E.</given-names></string-name>, &amp; <string-name name-style="western"><surname>Friston</surname>, <given-names>K.</given-names></string-name></person-group> (<year>2024</year><comment>a</comment>). <article-title>Natural language syntax complies with the free-energy principle.</article-title> <source>Synthese</source>, <volume>203</volume>, <elocation-id>154</elocation-id>. <pub-id pub-id-type="doi">10.1007/s11229-024-04566-3</pub-id><pub-id pub-id-type="pmid">38706520</pub-id></mixed-citation></ref>
<ref id="r44"><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name name-style="western"><surname>Murphy</surname>, <given-names>E.</given-names></string-name>, <string-name name-style="western"><surname>Rollo</surname>, <given-names>P. S.</given-names></string-name>, <string-name name-style="western"><surname>Segaert</surname>, <given-names>K.</given-names></string-name>, &amp; <string-name name-style="western"><surname>Hagoort</surname>, <given-names>P.</given-names></string-name></person-group> (<year>2024</year><comment>b</comment>). <article-title>Multiple dimensions of syntactic structure are resolved earliest in posterior temporal cortex.</article-title> <source>Progress in Neurobiology</source>, <volume>241</volume>, <elocation-id>102669</elocation-id>. <pub-id pub-id-type="doi">10.1016/j.pneurobio.2024.102669</pub-id><pub-id pub-id-type="pmid">39332803</pub-id></mixed-citation></ref>
<ref id="r45"><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name name-style="western"><surname>Murphy</surname>, <given-names>E.</given-names></string-name>, <string-name name-style="western"><surname>Woolnough</surname>, <given-names>O.</given-names></string-name>, <string-name name-style="western"><surname>Rollo</surname>, <given-names>P. S.</given-names></string-name>, <string-name name-style="western"><surname>Roccaforte</surname>, <given-names>Z.</given-names></string-name>, <string-name name-style="western"><surname>Segaert</surname>, <given-names>K.</given-names></string-name>, <string-name name-style="western"><surname>Hagoort</surname>, <given-names>P.</given-names></string-name>, &amp; <string-name name-style="western"><surname>Tandon</surname>, <given-names>N.</given-names></string-name></person-group> (<year>2022</year>). <article-title>Minimal phrase composition revealed by intracranial recordings.</article-title> <source>The Journal of Neuroscience</source>, <volume>42</volume>(<issue>15</issue>), <fpage>3216</fpage>–<lpage>3227</lpage>. <pub-id pub-id-type="doi">10.1523/JNEUROSCI.1575-21.2022</pub-id><pub-id pub-id-type="pmid">35232761</pub-id></mixed-citation></ref>
<ref id="r46"><mixed-citation publication-type="web">OpenAI. (2025, January 31). <italic>OpenAI o3-mini</italic>. <ext-link ext-link-type="uri" xlink:href="https://openai.com">https://openai.com</ext-link></mixed-citation></ref>
<ref id="r47"><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name name-style="western"><surname>Perkins</surname>, <given-names>L.</given-names></string-name>, &amp; <string-name name-style="western"><surname>Lidz</surname>, <given-names>J.</given-names></string-name></person-group> (<year>2021</year>). <article-title>Eighteen-month-old infants represent nonlocal syntactic dependencies.</article-title> <source>Proceedings of the National Academy of Sciences of the United States of America</source>, <volume>118</volume>(<issue>41</issue>), <elocation-id>e2026469118</elocation-id>. <pub-id pub-id-type="doi">10.1073/pnas.2026469118</pub-id><pub-id pub-id-type="pmid">34607945</pub-id></mixed-citation></ref>
<ref id="r48"><mixed-citation publication-type="book">Piantadosi, S. T. (2024). Modern language models refute Chomsky’s approach to language. In Gibson, E., &amp; Poliak, M. (Eds.), <italic>From Fieldwork to Linguistic Theory: A Tribute to Daniel Everett</italic> (pp. 353–414). Language Science Press.</mixed-citation></ref>
<ref id="r49"><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name name-style="western"><surname>Piantadosi</surname>, <given-names>S. T.</given-names></string-name>, <string-name name-style="western"><surname>Muller</surname>, <given-names>D. C. Y.</given-names></string-name>, <string-name name-style="western"><surname>Rule</surname>, <given-names>J. S.</given-names></string-name>, <string-name name-style="western"><surname>Kaushik</surname>, <given-names>K.</given-names></string-name>, <string-name name-style="western"><surname>Gorenstein</surname>, <given-names>M.</given-names></string-name>, <string-name name-style="western"><surname>Leib</surname>, <given-names>E. R.</given-names></string-name>, &amp; <string-name name-style="western"><surname>Sanford</surname>, <given-names>E.</given-names></string-name></person-group> (<year>2024</year>). <article-title>Why concepts are (probably) vectors.</article-title> <source>Trends in Cognitive Sciences</source>, <volume>28</volume>(<issue>9</issue>), <fpage>844</fpage>–<lpage>856</lpage>. <pub-id pub-id-type="doi">10.1016/j.tics.2024.06.011</pub-id><pub-id pub-id-type="pmid">39112125</pub-id></mixed-citation></ref>
<ref id="r50"><mixed-citation publication-type="book">Pinker, S. (2007). <italic>The stuff of thought: Language as a window into human nature</italic>. Penguin.</mixed-citation></ref>
<ref id="r51"><mixed-citation publication-type="preprint"><person-group person-group-type="author"><string-name name-style="western"><surname>Pfister</surname>, <given-names>R.</given-names></string-name>, &amp; <string-name name-style="western"><surname>Jud</surname>, <given-names>H.</given-names></string-name></person-group> (<year>2025</year>). <article-title>Understanding and benchmarking artificial general intelligence: OpenAI’s o3 is not AGI.</article-title> </mixed-citation></ref>
<ref id="r52"><mixed-citation publication-type="web">Ramchand, G. (2024). <italic>On LLMs, generative grammar, and how we need theory more than ever</italic>. Lingbuzz. <ext-link ext-link-type="uri" xlink:href="https://lingbuzz.net/008643">https://lingbuzz.net/008643</ext-link></mixed-citation></ref>
<ref id="r53"><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name name-style="western"><surname>Rosenblueth</surname>, <given-names>A.</given-names></string-name>, &amp; <string-name name-style="western"><surname>Wiener</surname>, <given-names>N.</given-names></string-name></person-group> (<year>1945</year>). <article-title>The role of models in science.</article-title> <source>Philosophy of Science</source>, <volume>12</volume>(<issue>4</issue>), <fpage>316</fpage>–<lpage>321</lpage>. <pub-id pub-id-type="doi">10.1086/286874</pub-id></mixed-citation></ref>
<ref id="r54"><mixed-citation publication-type="preprint"><person-group person-group-type="author"><string-name name-style="western"><surname>Russin</surname>, <given-names>J.</given-names></string-name>, <string-name name-style="western"><surname>McGrath</surname>, <given-names>S. W.</given-names></string-name>, <string-name name-style="western"><surname>Williams</surname>, <given-names>D. J.</given-names></string-name>, &amp; <string-name name-style="western"><surname>Elber-Dorozko</surname>, <given-names>L.</given-names></string-name></person-group> (<year>2024</year>). <article-title>From Frege to ChatGPT: compositionality in language, cognition, and deep neural networks.</article-title> </mixed-citation></ref>
<ref id="r55"><mixed-citation publication-type="preprint"><person-group person-group-type="author"><string-name name-style="western"><surname>Schaeffer</surname>, <given-names>R.</given-names></string-name>, <string-name name-style="western"><surname>Miranda</surname>, <given-names>B.</given-names></string-name>, &amp; <string-name name-style="western"><surname>Koyejo</surname>, <given-names>S.</given-names></string-name></person-group> (<year>2023</year>). <article-title>Are emergent abilities of large language models a mirage?</article-title> </mixed-citation></ref>
<ref id="r56"><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name name-style="western"><surname>Sprouse</surname>, <given-names>J.</given-names></string-name>, &amp; <string-name name-style="western"><surname>Almeida</surname>, <given-names>D.</given-names></string-name></person-group> (<year>2012</year>). <article-title>Assessing the reliability of textbook data in syntax: Adger’s Core Syntax.</article-title> <source>Journal of Linguistics</source>, <volume>48</volume>, <fpage>609</fpage>–<lpage>652</lpage>. <pub-id pub-id-type="doi">10.1017/S0022226712000011</pub-id></mixed-citation></ref>
<ref id="r57"><mixed-citation publication-type="book">Tesnière, L. (1959). <italic>Éléments de syntaxe structurale</italic>. Librairie C. Klincksieck. Republished as <italic>Elements of Structural Syntax</italic>. Translated by Timothy Osborne and Sylvain Kahane. John Benjamins.</mixed-citation></ref>
<ref id="r58"><mixed-citation publication-type="preprint"><person-group person-group-type="author"><string-name name-style="western"><surname>Tjuatja</surname>, <given-names>L.</given-names></string-name>, <string-name name-style="western"><surname>Neubig</surname>, <given-names>G.</given-names></string-name>, <string-name name-style="western"><surname>Linzen</surname>, <given-names>T.</given-names></string-name>, &amp; <string-name name-style="western"><surname>Hao</surname>, <given-names>S.</given-names></string-name></person-group> (<year>2024</year>). <article-title>What goes into a LM acceptability judgment? Rethinking the impact of frequency and length.</article-title> </mixed-citation></ref>
<ref id="r59"><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name name-style="western"><surname>Toosarvandani</surname>, <given-names>M.</given-names></string-name></person-group> (<year>2014</year>). <article-title>Contrast and the structure of discourse.</article-title> <source>Semantics and Pragmatics</source>, <volume>7</volume>, <elocation-id>4</elocation-id>. <pub-id pub-id-type="doi">10.3765/sp.7.4</pub-id></mixed-citation></ref>
<ref id="r60"><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name name-style="western"><surname>van Rooij</surname>, <given-names>I.</given-names></string-name>, <string-name name-style="western"><surname>Guest</surname>, <given-names>O.</given-names></string-name>, <string-name name-style="western"><surname>Adolfi</surname>, <given-names>F.</given-names></string-name>, <string-name name-style="western"><surname>de Haan</surname>, <given-names>R.</given-names></string-name>, <string-name name-style="western"><surname>Kolokova</surname>, <given-names>A.</given-names></string-name>, &amp; <string-name name-style="western"><surname>Rich</surname>, <given-names>P.</given-names></string-name></person-group> (<year>2024</year>). <article-title>Reclaiming AI as a theoretical tool for cognitive science.</article-title> <source>Computational Brain &amp; Behavior</source>, <volume>7</volume>, <fpage>616</fpage>–<lpage>636</lpage>. <pub-id pub-id-type="doi">10.1007/s42113-024-00217-5</pub-id></mixed-citation></ref>
<ref id="r61"><mixed-citation publication-type="web">Wu, D. (2025). <italic>Constituent negation requires entailment of an alternative</italic>. Lingbuzz. <ext-link ext-link-type="uri" xlink:href="https://lingbuzz.net/008781">https://lingbuzz.net/008781</ext-link></mixed-citation></ref>
<ref id="r62"><mixed-citation publication-type="preprint"><person-group person-group-type="author"><string-name name-style="western"><surname>Yun</surname>, <given-names>C.</given-names></string-name>, <string-name name-style="western"><surname>Bhojanapalli</surname>, <given-names>S.</given-names></string-name>, <string-name name-style="western"><surname>Rawat</surname>, <given-names>A. S.</given-names></string-name>, <string-name name-style="western"><surname>Reddi</surname>, <given-names>S. J.</given-names></string-name>, &amp; <string-name name-style="western"><surname>Kumar</surname>, <given-names>S.</given-names></string-name></person-group> (<year>2019</year>). <article-title>Are transformers universal approximators of sequence-to-sequence functions?</article-title> </mixed-citation></ref>
<ref id="r63"><mixed-citation publication-type="confproc">Zhao, J., &amp; Zhang, X. (2024). Large Language Model is not a (multilingual) compositional relation reasoner. <italic>Proceedings of the First Conference on Language Modeling</italic>. Philadelphia, United States.</mixed-citation></ref>
</ref-list>
	<sec sec-type="data-availability" id="das"><title>Data Availability</title>
		<p>Prompts and responses from o3 are available in Supplementary Materials (see <xref ref-type="bibr" rid="sp1_r1">Murphy et al., 2025</xref>).</p>
	</sec>	

	
	
	
	<sec sec-type="supplementary-material" id="sp1"><title>Supplementary Materials</title>
		<p>For this article, prompts and responses from o3 are available as Supplementary Materials (see <xref ref-type="bibr" rid="sp1_r1">Murphy et al., 2025</xref>).</p>
		<ref-list content-type="supplementary-material" id="suppl-ref-list">
			<ref id="sp1_r1">
				<mixed-citation publication-type="supplementary-material">
					<person-group person-group-type="author">
							<name name-style="western">
								<surname>Murphy</surname>
								<given-names>E.</given-names>
							</name>
							<name name-style="western">
								<surname>Leivada</surname>
								<given-names>E.</given-names>
							</name>
							<name name-style="western">
								<surname>Dentella</surname>
								<given-names>V.</given-names>
							</name>
							<name name-style="western">
								<surname>Montero</surname>
								<given-names>R.</given-names>
							</name>
							<name name-style="western">
								<surname>Günther</surname>
								<given-names>F.</given-names>
							</name>
							<name name-style="western">
								<surname>Marcus</surname>
								<given-names>G.</given-names>
							</name>
					</person-group> (<year>2025</year>). <source>Supplementary materials to "Fundamental principles of linguistic structure are not represented by ChatGPT"</source> <comment>[Data]</comment>. <publisher-name>PsychOpen GOLD</publisher-name>. <pub-id pub-id-type="doi" xlink:href="https://doi.org/10.23668/psycharchives.21439">10.23668/psycharchives.21439</pub-id>		
				</mixed-citation>
			</ref>
		</ref-list>
	</sec>
			

<fn-group>
<fn fn-type="financial-disclosure"><p>The authors have no funding to report.</p></fn>
</fn-group>
<fn-group>
<fn fn-type="conflict"><p>The authors have declared that no competing interests exist.</p></fn>
</fn-group>
<ack>
<p>The authors have no additional (i.e., non-financial) support to report.</p>
</ack>
</back>
</article>
