main.tex

%%
%% This is file `sample-manuscript.tex',
%% generated with the docstrip utility.
%%
%% The original source files were:
%%
%% samples.dtx  (with options: `all,proceedings,bibtex,manuscript')
%% 
%% IMPORTANT NOTICE:
%% 
%% For the copyright see the source file.
%% 
%% Any modified versions of this file must be renamed
%% with new filenames distinct from sample-manuscript.tex.
%% 
%% For distribution of the original source see the terms
%% for copying and modification in the file samples.dtx.
%% 
%% This generated file may be distributed as long as the
%% original source files, as listed above, are part of the
%% same distribution. (The sources need not necessarily be
%% in the same archive or directory.)
%%
%%
%% Commands for TeXCount
%TC:macro \cite [option:text,text]
%TC:macro \citep [option:text,text]
%TC:macro \citet [option:text,text]
%TC:envir table 0 1
%TC:envir table* 0 1
%TC:envir tabular [ignore] word
%TC:envir displaymath 0 word
%TC:envir math 0 word
%TC:envir comment 0 0
%%
%%
%% The first command in your LaTeX source must be the \documentclass
%% command.
%%
%% For submission and review of your manuscript please change the
%% command to \documentclass[manuscript, screen, review]{acmart}.
%%
%% When submitting camera ready or to TAPS, please change the command
%% to \documentclass[sigconf]{acmart} or whichever template is required
%% for your publication.
%%
%%
\documentclass[sigconf,screen,review,anonymous]{acmart}

\usepackage{caption}
\usepackage[utf8]{inputenc}
\usepackage[capitalize, noabbrev]{cleveref}
\usepackage{xcolor}
\usepackage{enumitem}

\usepackage{pgfplots}
\usepackage{listings}
\usepackage{subcaption}

\lstdefinestyle{prompt}{
  basicstyle=\small,
  breaklines=true,
  breakindent=0pt,
  frame=single,
  escapeinside={\%*}{*)},  % if you want to add LaTeX within your code
}

\lstset{
  basicstyle=\small, % or \footnotesize, \scriptsize, etc.
  % other settings
}

\newcommand{\highlight}[1]{\textcolor{blue}{\textbf{#1}}}

% commands for the findings box
\newcommand{\findingsbox}[1]{%
  \vspace{8pt}%
  \noindent%
  \setlength{\fboxsep}{-0.25pt}% 
  \setlength{\fboxrule}{0pt}%
  \fbox{%
  \setlength{\fboxrule}{0.25pt}%
    \fcolorbox{gray!90}{gray!10}{%
      \parbox{\dimexpr\columnwidth-2\fboxsep-2\fboxrule}{%
        {\color{gray!90}\vrule width 6pt}%
        \hspace{10pt}%
        \parbox{\dimexpr\columnwidth-4\fboxsep-4\fboxrule-29pt}{%
          \color{black}% 
          \vspace{8pt}% 
          #1%
          \vspace{8pt}%
        }%
      }%
    }%
  }%
}   


%%
%% \BibTeX command to typeset BibTeX logo in the docs
\AtBeginDocument{%
  \providecommand\BibTeX{{%
    Bib\TeX}}}

%% Rights management information.  This information is sent to you
%% when you complete the rights form.  These commands have SAMPLE
%% values in them; it is your responsibility as an author to replace
%% the commands and values with those provided to you when you
%% complete the rights form.
\setcopyright{acmlicensed}
\copyrightyear{2024}
\acmYear{2024}
\acmDOI{XXXXXXX.XXXXXXX}

%% These commands are for a PROCEEDINGS abstract or paper.
\acmConference[FSE '25]{33rd ACM Symposium on the Foundations of Software Engineering}{June 23--27,
   2025}{Trondheim, Norway}
%%
%%  Uncomment \acmBooktitle if the title of the proceedings is different
%%  from ``Proceedings of ...''!
%%
% \acmBooktitle{Companion Proceedings of the 33rd ACM Symposium on the Foundations of Software Engineering (FSE '25), June 23--27, 2025, Trondheim, Norway}
% \acmISBN{978-1-4503-XXXX-X/18/06}


%%
%% Submission ID.
%% Use this when submitting an article to a sponsored event. You'll
%% receive a unique submission ID from the organizers
%% of the event, and this ID should be used as the parameter to this command.
%%\acmSubmissionID{123-A56-BU3}

%%
%% For managing citations, it is recommended to use bibliography
%% files in BibTeX format.
%%
%% You can then either use BibTeX with the ACM-Reference-Format style,
%% or BibLaTeX with the acmnumeric or acmauthoryear sytles, that include
%% support for advanced citation of software artefact from the
%% biblatex-software package, also separately available on CTAN.
%%
%% Look at the sample-*-biblatex.tex files for templates showcasing
%% the biblatex styles.
%%

%%
%% The majority of ACM publications use numbered citations and
%% references.  The command \citestyle{authoryear} switches to the
%% "author year" style.
%%
%% If you are preparing content for an event
%% sponsored by ACM SIGGRAPH, you must use the "author year" style of
%% citations and references.
%% Uncommenting
%% the next command will enable that style.
%%\citestyle{acmauthoryear}


%%
%% end of the preamble, start of the body of the document source.
\begin{document}

%%
%% The "title" command has an optional parameter,
%% allowing the author to define a "short title" to be used in page headers.
% \title{Automated Feedback Loops: Enhancing Student Motivation and Performance in Programming Courses}
\title{Direct Automated Feedback Delivery for Student Submissions based on LLMs}

%%
%% The "author" command and its associated commands are used to define
%% the authors and their affiliations.
%% Of note is the shared affiliation of the first two authors, and the
%% "authornote" and "authornotemark" commands
%% used to denote shared contribution to the research.


\author{Maximilian Sölch}
\email{maximilian.soelch@tum.de}
\orcid{0009-0004-1509-7842} 
\affiliation{%
	\institution{Technical University of Munich}
	\city{Munich}
	\country{Germany}
}

\author{Felix T.J. Dietrich}
\email{felixtj.dietrich@tum.de}
\orcid{0009-0007-5826-2061} 
\affiliation{%
	\institution{Technical University of Munich}
	\city{Munich}
	\country{Germany}
}

\author{Stephan Krusche}
\email{krusche@tum.de}
\orcid{0000-0002-4552-644X}
\affiliation{%
	\institution{Technical University of Munich}
	\city{Munich}
	\country{Germany}
}

%%
%% By default, the full list of authors will be used in the page
%% headers. Often, this list is too long, and will overlap
%% other information printed in the page headers. This command allows
%% the author to define a more concise list
%% of authors' names for this purpose.
\renewcommand{\shortauthors}{Sölch et al.}

%%
%% The abstract is a short summary of the work to be presented in the
%% article.
\begin{abstract}

Timely and individualized feedback is essential for students' learning progress and motivation, yet providing such feedback has become increasingly challenging due to growing student numbers.
This has resulted in a time-consuming, repetitive, and often manual task for educators, contributing to a high workload.

This paper presents DAFeeD, an LLM-based approach for automated feedback on student submissions across various exercise domains.
The defined feedback process enables interactive learning by allowing students to submit solutions multiple times and automatically receive iterative LLM feedback on their submission attempts before deadlines.
By incorporating task details, grading criteria, student solutions, and custom instructions into the prompt, DAFeeD provides clear, personalized, and pedagogically meaningful feedback to support continuous improvement.

To evaluate the feedback process, we implemented DAFeeD in an open-source reference implementation integrated into the learning platform LP.
A controlled study with students working on a programming task in a supervised environment showed that students found the feedback relevant and beneficial.
They reported feeling more comfortable and willing to request automated feedback due to its convenience and immediacy.
Additionally, deploying DAFeeD in a software engineering course with 450 students demonstrated improvements in student performance and encouraged iterative refinement through multiple submissions.

These findings highlight DAFeeD's potential to enhance feedback processes in computing education, improving both learning efficiency and student outcomes.

\end{abstract}

%%
%% The code below is generated by the tool at http://dl.acm.org/ccs.cfm.
%% Please copy and paste the code instead of the example below.
%%
\begin{CCSXML}
  <ccs2012>
     <concept>
         <concept_id>10003456.10003457.10003527.10003540</concept_id>
         <concept_desc>Social and professional topics~Student assessment</concept_desc>
         <concept_significance>500</concept_significance>
         </concept>
     <concept>
         <concept_id>10010405.10010489</concept_id>
         <concept_desc>Applied computing~Education</concept_desc>
         <concept_significance>500</concept_significance>
         </concept>
   </ccs2012>
\end{CCSXML}
  
\ccsdesc[500]{Social and professional topics~Student assessment}
\ccsdesc[500]{Applied computing~Education}

%%
%% Keywords. The author(s) should pick words that accurately describe
%% the work being presented. Separate the keywords with commas.
\keywords{Software Engineering, Education, Formative Feedback, Interactive Learning}

% \received{20 February 2007}
% \received[revised]{12 March 2009} 
% \received[accepted]{5 June 2009}

%%
%% This command processes the author and affiliation and title
%% information and builds the first part of the formatted document.
\maketitle

\section{Introduction} % 1 page

% problem
% time consuming, not scalable, not always available
% hinders learning progress and motivation

% todo: reference for this! \cite{jonsson:2013:FacilitatingProductiveUse} \cite{shute:2008:FocusFormativeFeedback}
In the current educational landscape, providing timely and effective feedback to students remains a significant challenge for both students and educators.
Traditionally, students must wait for educators to review their submissions and provide feedback.
This process can be time-consuming, often requiring students to arrange meetings and wait for available time slots, which are not always convenient or immediate.
Similar it is time-consuming and tedious for educators to provide asynchronous feedback via email or other communication channels \cite{henderson:2019:ChallengesFeedbackHigher}.
The inherent delays and scheduling difficulties make this approach not scalable, especially in courses with a large number of students.

Providing individualized feedback and enabling students to enhance their knowledge through formative assessments are important components of effective learning \cite{irons:2007:EnhancingLearningFormative,higgins:2002:ConscientiousConsumerReconsidering}.
However, the limited availability of educators means that not all students receive the individualized attention they need to improve their understanding and skills.
This situation underscores the necessity for a more efficient and scalable feedback system that can provide continuous support and feedback to students without the constraints of traditional methods \cite{sondergaard:2004:EffectiveFeedbackSmall}.
Such a feedback system enables interactive learning for students, increasing their engagement in the course resulting in better final grades \cite{krusche:2017:InteractiveLearningIncreasing}.

% Objectives
In this paper, we present Direct Automated Feedback Delivery (DAFeeD), an approach for generating automated feedback on student submissions using the assistance of large language models (LLMs), to address these challenges.
DAFeeD uses the exercise problem statement, predefined grading criteria (when available), the student solution and custom instructions to create a prompt for the LLM with the aim to create didactically meaningful feedback that supports the learning process.
Designed to provide educators with full control and the ability to fine-tune the prompt, DAFeeD aims to reduce time spent on repetitive tasks, allowing educators to focus more on direct interactions with students.
The approach is independent of the exercise type and can be applied and adapted to various domains, such as programming, text, or UML modeling exercises.

We implemented the approach in an open-source reference implementation called FeedbackSystem\footnote{FeedbackSystem - Anonymized due to double-blind review regulations}, connected to the learning platform LP\footnote{LP - Anonymized due to double-blind review regulations} \cite{krusche:2018:ArTEMiSAutomaticAssessmentf} through which students submit their solutions and receive feedback.
To validate the effectiveness and efficiency of the approach, we first tested it in a controlled environment, then employed it in an actual software engineering course.
% We collected quantitative and qualitative data to evaluate students' perceptions of the approach and the overall performance of the reference implementation.
% The results show that students perceive the feedback process as relevant and beneficial to their learning, indicating that the approach has the potential to significantly improve the feedback process in educational institutions and increase students' learning efficiency and performance.
With this paper, we want to answer the following research questions about direct automated feedback delivery:

\begin{enumerate}[label=\textbf{RQ\arabic*},ref=RQ\arabic*]
  % \item \label{RQ1} How does the availability of DAFeeD affect student engagement and motivation? 
  \item \label{RQ1} Do students feel more comfortable requesting automatic feedback from DAFeeD than asking a human educator?
  \item \label{RQ2} How do students perceive the effectiveness of DAFeeD?
  \item \label{RQ3} How do students perceive the usability and helpfulness of DAFeeD?
  \item \label{RQ4} Does the DAFeeD process improve students' performance?
  \item \label{RQ5} How does DAFeeD's feedback compare against human tutor feedback?
\end{enumerate}


%Outline
The subsequent sections of this paper are systematically structured to offer a thorough understanding of the research.
\cref{sec:related-work} provides an overview of related work. 
\cref{sec:approach:DAFeeD} details the concept and methodology of DAFeeD.
\cref{sec:reference-implementation} describes the reference implementation of DAFeeD, called FeedbackSystem, including a general overview, details on the used prompts, and the system architecture. 
\cref{sec:evaluation} describes the study design, presents the evaluation results, and outlines findings and limitations. 
Finally, \cref{sec:conclusion} concludes with a summary of findings and discusses future research directions to enhance automated feedback systems.

%\newpage
\section{Related Work} % 2 page
\label{sec:related-work}

% A Systematic Review of the Effects of Automatic Scoring and Automatic Feedback in Educational Settings
% \citet{hahn:2021:SystematicReviewEffects}
Automated feedback systems have gained significant attention in educational research due to their potential to scale online education and reduce the time between submission and feedback.
\citet{hahn:2021:SystematicReviewEffects} conducted a systematic review on the effects of automatic scoring and feedback tools, emphasizing their crucial role in enhancing scalability, reducing bias, and increasing student engagement.
Their insights highlight the broader implications of automated feedback systems in education, which is highly relevant to this work's study.


% A Systematic Literature Review of Automated Feedback Generation for Programming Exercises
% \citet{keuning:2018:SystematicLiteratureReview} 
%
% -> Extension for potential of LLMs:
% Exploring the Potential of Large Language Models to Generate Formative Programming Feedback
% \citet{kiesler:2023:ExploringPotentialLarge} 
In the domain of programming education, \citet{keuning:2018:SystematicLiteratureReview} reviewed 101 tools for automated feedback on programming exercises.
They noted that most tools focus on error identification rather than providing actionable guidance or adapting to specific instructional needs.
Extending this work, \citet{kiesler:2023:ExploringPotentialLarge} explored the effectiveness of LLMs like ChatGPT in generating formative programming feedback, finding that while LLMs can produce useful feedback, they often include misleading information for novices.
This emphasizes the need for careful design and evaluation of LLM-based feedback systems to ensure reliability and accuracy.


% How Novices Use LLM-based Code Generators to Solve CS1 Coding Tasks in a Self-Paced Learning Environment
% \citet{kazemitabaar:2024:HowNovicesUse}
%
% To trust or to think: Cognitive forcing functions can reduce overreliance on AI in AI-assisted decision-making.
% \citet{bucinca:2021:TrustThinkCognitive}
%
% Programming Is Hard - Or at Least It Used to Be: Educational Opportunities and Challenges of AI Code Generation
% \citet{becker:2023:ProgrammingHardLeast}
\citet{kazemitabaar:2024:HowNovicesUse} examined how novice programmers interact with LLM-based code generators in self-paced learning environments. 
They identified distinct usage patterns and their impact on learning outcomes, revealing that a ``Hybrid'' approach — combining manual coding with LLM assistance — was most beneficial for learners. 
This aligns with findings by \citet{bucinca:2021:TrustThinkCognitive}, who highlighted the dangers of over-reliance on AI and proposed cognitive forcing functions to encourage deeper engagement with AI outputs. 
Similarly, \citet{becker:2023:ProgrammingHardLeast} discussed both the opportunities and challenges of AI-driven code generation tools, emphasizing the need for educators to guide students in leveraging these technologies effectively without becoming dependent on them. 
These findings highlight the importance of balancing AI assistance with traditional learning methods, which is a key consideration in the DAFeeD approach.


% Focus on Formative Feedback
% \citet{shute:2008:FocusFormativeFeedback} 
%
% What makes for effective feedback: staff and student perspectives
% \citet{dawson:2019:WhatMakesEffective}
The importance of timely and specific feedback is well-documented.
\citet{shute:2008:FocusFormativeFeedback} provided a comprehensive review of formative feedback, highlighting its necessity for being nonevaluative, supportive, timely, and specific.
\citet{dawson:2019:WhatMakesEffective} further explored perceptions of effective feedback, revealing that while educators focus on design aspects such as timing and modalities, students prioritize the quality and usability of feedback comments.
This underscores the need for automated feedback systems to deliver not only timely but also detailed, specific, and personalized comments.


% Adaptive Immediate Feedback Can Improve Novice Programming Engagement and Intention to Persist in Computer Science
% \citet{marwan:2020:AdaptiveImmediateFeedback}
%
% A Comparison of Immediate and Scheduled Feedback in Introductory Programming Projects
% \citet{leinonen:2022:ComparisonImmediateScheduled}
Adaptive and immediate feedback mechanisms have been shown to significantly enhance student learning outcomes.
\citet{marwan:2020:AdaptiveImmediateFeedback} demonstrated that adaptive and immediate feedback can improve student performance and motivation.
Similarly, \citet{leinonen:2022:ComparisonImmediateScheduled} compared immediate and scheduled feedback, concluding that immediate feedback is more effective in promoting student engagement and timely corrections.
These studies collectively stress the potential of automated feedback systems in providing timely, adaptive, and engaging feedback, crucial for continuous improvement and learning efficiency.


% Feedback-Generation for Programming Exercises With GPT-4:
% Imen Azaiz, Natalie Kiesler, and Sven Strickroth. 2024. Feedback-Generation for Programming Exercises With GPT-4. arXiv:2403.04449 [cs]
% Accepted at ITiCSE '24, first author is from LMU
The work by \citet{azaiz:2024:FeedbackGenerationProgrammingExercises} highlights the limitations of LLMs, advising against using GPT-4 Turbo for automatic feedback generation in programming education due to inconsistencies.
In contrast, this work's research with DAFeeD evaluates an integrated direct automatic feedback delivery process within a learning management system (LMS), demonstrating its potential immediate benefits for students, especially when feedback is critically evaluated.
We believe that increasingly powerful LLMs and advanced prompting strategies will enhance feedback quality over time, with appropriate guardrails to prevent revealing solutions.


% CodeHelp: Using Large Language Models with Guardrails for Scalable Support in Programming Classes
% \citet{liffiton:2024:CodeHelpUsingLarge}
The study by \citet{liffiton:2024:CodeHelpUsingLarge} introduces CodeHelp, an LLM-powered tool that provides real-time assistance to programming students.
In a first-year computer science course with 52 students, CodeHelp collected data over 12 weeks, revealing that students valued its availability, immediacy, and support for error resolution and independent learning.
CodeHelp requires students to manually enter code, error messages, and issue descriptions.
In contrast, DAFeeD integrates into the LMS, automatically providing context and feedback on code repository changes without requiring student input.
This seamless integration aims to increase student engagement and motivation by offering timely, individualized feedback automatically, and to improve perceptions of feedback effectiveness, usability, and helpfulness.


% Using GPT-4 to Provide Tiered, Formative Code Feedback
% \citet{nguyen:2024:UsingGPT4Providea}
Similarly, \citet{nguyen:2024:UsingGPT4Providea} demonstrate the feasibility of using GPT-4 for tiered, formative feedback on programming exercises in introductory courses, providing insights on conceptual understanding, syntax, and time complexity.
The DAFeeD approach proposed in this paper is evaluated using a similar LLM, GPT-4 Turbo, and likewise focuses on providing formative feedback in introductory courses.
However, while \citet{nguyen:2024:UsingGPT4Providea} provide feedback on isolated code snippets using few-shot learning, DAFeeD delivers iterative feedback on entire repositories with multiple files using detailed prompts and context collection.
Additionally, DAFeeD is integrated directly into an LMS, supporting multiple exercise domains, which allows students to iteratively improve their submissions before the deadline, aiming to enhance learning outcomes and engagement through interactive learning.


% AI Teaches the Art of Elegant Coding: Timely, Fair, and Helpful Style Feedback in a Global Course
% \citet{woodrow:2024:AITeachesArta}
\citet{woodrow:2024:AITeachesArt} explore the deployment and effectiveness of a real-time style feedback tool using LLMs, specifically GPT-3.5 Turbo, in a large-scale online CS1 course.
Their findings indicate significant improvements in student engagement and coding style when feedback is immediate and integrated within the learning platform.
\citeauthor{woodrow:2024:AITeachesArt} conducted a randomized control trial with over 8,000 students, demonstrating that real-time feedback was five times more likely to be viewed and incorporated by students compared to delayed feedback.
This supports the approach with DAFeeD, emphasizing the importance of immediate, individualized feedback in enhancing student learning outcomes.

% UML Modeling
Besides programming exercises, recent studies have explored using LLMs to provide automated feedback in UML modeling education.
\citet{ardimento:2024:TeachingUMLUsinga} introduced a cloud-based tool that utilizes Retrieval-Augmented Generation (RAG) to analyze UML diagrams and offer contextually relevant suggestions.
An evaluation with 5,120 labeled UML models demonstrated its effectiveness in helping students identify and correct common errors, enhancing their understanding of UML concepts.
A follow-up study further improved error detection and provided real-time, constructive feedback, with user feedback highlighting the tool's potential to enhance software modeling education \cite{ardimento:2024:RAGbasedFeedbackTool}.

Similarly, \citet{camara:2023:AssessmentGenerativeAI} examined generative AI tools, such as ChatGPT, for formative assessment in UML modeling tasks.
Their findings show that AI-assisted feedback helps students track their progress and improve performance compared to traditional methods.
However, the study also emphasizes the importance of educating students on AI limitations to prevent over-reliance on automated feedback.

A broader perspective on the capabilities and challenges of LLMs is provided by \citet{wei:2022:EmergentAbilitiesLarge}, who discussed the emergent abilities of LLMs that are not present in smaller models, highlighting the need for ongoing research to harness these capabilities effectively.
\citet{huang:2023:SurveyHallucinationLargeb} addressed the issue of hallucinations in LLMs, offering an in-depth overview of detection methods and mitigation strategies crucial for developing reliable feedback systems.
\citet{amatriain:2024:PromptDesignEngineering} detailed core concepts and advanced techniques in prompt engineering, such as Chain-of-Thought and Reflection, which can enhance the quality and relevance of automated feedback.
Additionally, \citet{liu:2024:JailbreakingChatGPTPrompt} highlighted the importance of robust prompt design to prevent misuse, investigating vulnerabilities of LLMs to jailbreak prompts.
\citet{zhao:2023:SurveyLargeLanguage} reviewed the evolution and recent advances of LLMs, focusing on pre-training, adaptation tuning, utilization, and capacity evaluation, and highlighting the progress and ongoing challenges in the field.
Lastly, \citet{yang:2024:HarnessingPowerLLMs} provided a comprehensive guide for practitioners working with LLMs, discussing the influence of pre-training data and challenges associated with different natural language processing tasks, offering insights for developing and deploying LLM-based feedback systems.

In summary, the related work collectively highlights the evolving landscape of automated feedback systems and the significant potential of LLMs to enhance educational outcomes through immediate, specific, and actionable feedback.
The primary contribution with DAFeeD lies in its seamless integration within the LMS, supporting a variety of exercise types and having access to all relevant context information used for the feedback prompt.
By using LLMs, DAFeeD provides individualized feedback automatically, with a focus on the feedback delivery process.
This approach enables students to iteratively improve their solutions and learn continuously without direct intervention from tutors or professors.
All of this is possible without leaving their preferred learning platform.
DAFeeD aims to enhance student engagement, learning efficiency, and performance through timely, relevant, and personalized feedback, aligning with and advancing the findings of the reviewed studies.

% ================

% A review of automated feedback systems for learners: Classification framework, challenges, and opportunities.
% \citet{deeva:2021:ReviewAutomatedFeedback}
% Felix: Won't include (for now)

% Grounded copilot: How programmers interact with code-generating models.
% \citet{barke:2023:GroundedCopilotHow}
% Felix: Won't include (for now)

% Studying the effect of AI Code Generators on Supporting Novice Learners in Introductory Programming
% \citet{kazemitabaar:2023:StudyingEffectAIa
% Felix: Won't include (for now)

% Exploring the Responses of Large Language Models to Beginner Programmers’ Help Requests
% \citet{hellas:2023:ExploringResponsesLargea}
% Felix: Won't include (for now)

% Investigating the Potential of GPT-3 in Providing Feedback for Programming Assessments
% \citet{balse:2023:InvestigatingPotentialGPT3}
% Felix: Won't include (for now)

% Using Large Language Models to Enhance Programming Error Messages
% \citet{leinonen:2023:UsingLargeLanguagea}
% Felix: Won't include (for now)

% Next-Step Hint Generation for Introductory Programming Using Large Language Models
% \citet{roest:2024:NextStepHintGeneration}
% Felix: Won't include (for now)

% Design principles for generating and presenting automated formative feedback on code quality using software metrics
% \citet{vandenaker:2024:DesignPrinciplesGeneratingb}
% Felix: Won't include (for now)

% PyDex: Repairing Bugs in Introductory Python Assignments using LLMs
% \citet{zhang:2024:PyDexRepairingBugs}
% Felix: Won't include (for now)

% =====================

%\newpage
\section{Approach: Direct Automated Feedback Delivery (DAFeeD)} % 1 page
\label{sec:approach:DAFeeD}

To complement traditional teaching methods and provide additional support, DAFeeD employs LLMs to deliver automated feedback on student submissions.
Figure \ref{fig:DAFeeD-workflow} illustrates the continuous feedback workflow that DAFeeD facilitates, enabling students to receive feedback at any time, thereby eliminating the need to wait for responses from human educators.

\begin{figure*}[htbp]
  \centering
  \includegraphics[width=\linewidth]{figures/DAFeeD-ActivityDiagram_alternative.pdf}
  \vspace{-6mm}
  \caption{UML activity diagram of the Direct Automated Feedback Delivery (DAFeeD) workflow for student submissions.}
  \label{fig:DAFeeD-workflow}
  \vspace{-3mm}
\end{figure*}

The feedback process is designed to be exercise-independent, meaning that it can be applied and adapted to various exercise types, such as programming, text, or modeling exercises.
DAFeeD can automatically provide feedback to the students, including feedback on issues or improvements, as well as positive feedback when the student completes the task correctly.
Once the student submits their solution, DAFeeD initiates a three-stage process to generate natural language feedback.

The first stage, called \textit{Formatting}, takes the student's submission and extracts the submission content, the problem statement, including learning objectives, and any possible grading instructions the educator defines.
In addition, the student's learner profile \cite{alexander:1998:ProfilingDifferencesStudents} is considered to generate individualized, personal feedback tailored to the student's skills and learning styles.
All of this gathered information represents the context.

During the prompt generation step, a predefined prompt template is filled with the individual prompt input data that is included in the context, resulting in the feedback prompt.
Depending on the exercise, adaptions need to be made to the prompt template to ensure that the feedback output of the LLM is tailored to the specific exercise type.
For programming exercises, the generated feedback needs to have metadata information about the file and line number of the code snippet to which the feedback refers.
In the case of text exercises, the feedback needs to contain metadata identifying the specific sentence or word range to which it applies.
Similarly, for modeling exercises, metadata must reference the corresponding model element or relation to ensure precise feedback alignment.

In the second stage, called \textit{Predicting}, DAFeeD sends the feedback prompt to a LLM and invokes it with the prompt. 
As a result, the LLM generates a response to that prompt including detailed feedback items for the student.

The final stage, \textit{Parsing}, takes the LLM response, which comes in the JSON format, and parses feedback items from it. 
In addition to the feedback text, the feedback object also contains reference information indicating the part of the submission it pertains to.
For programming exercises, this includes the file name and line number of the relevant code snippet to which the feedback refers.
For text exercises, the reference information includes only the corresponding sentence or word range.
When it comes to modeling exercises, the feedback needs to reference the specific model element or relation it pertains to.

All of the feedback is then returned to the student for review.
If the student is satisfied with the feedback, the process concludes. 
Otherwise, the student can refine and resubmit their solution, initiating the DAFeeD process anew.

This iterative process is designed to motivate students to continuously learn and experiment with their solutions, resulting in improved performance.

\section{Reference Implementation: FeedbackSystem} % 2 pages
\label{sec:reference-implementation}

We incorporated DAFeeD into a reference implementation named FeedbackSystem, which is seamlessly integrated with the learning platform LP. 
Through LP, students can submit their solution and review the feedback.
FeedbackSystem supports the feedback generation for programming, text, and modeling exercises.

When submitting their solutions on LP, students have the option to request direct automated feedback by clicking a dedicated button.
This feedback request is then sent to FeedbackSystem, assuming the student has not reached their feedback request limit for the exercise.
Educators can customize the limit of allowed feedback requests per exercise according to their preference.
A status visualization informs students about their feedback request state.
Once FeedbackSystem generates the feedback and sends it back to LP, the student can review it in an inline feedback view window on LP.
The feedback view is tailored to different exercise types, ensuring that feedback is presented in a format most suitable for the nature of the task.
For instance, text exercises include detailed inline comments, while modeling exercises feature visual annotations on model elements and relations.
An example visualization of the inline feedback interface for a text exercise submission is depicted in Figure \ref{fig:Artemis-feedback-visualization}.

\begin{figure}[htbp]
  \centering
  \includegraphics[width=\linewidth]{figures/text-feedback-viewer-cut.png}
  \vspace{-5mm}
  \caption{Visualization of the inline feedback interface for text exercises in the LP as seen by students.}
  \label{fig:Artemis-feedback-visualization}
  \vspace{-3mm}
\end{figure}

The inline feedback view for text exercises offers a targeted approach by directly associating feedback items with specific text spans, such as highlighting the second sentence in the example shown.
The interface also distinguishes between referenced feedback, tied to specific text, and unreferenced feedback, offering general observations.


\subsection{Feedback Generation}

The prompt design is crucial for guiding the LLM in generating effective and contextually relevant feedback.
The system is configurable, allowing the use of different LLMs and model settings. 
In Listing \ref{lst:prompt-programming-exercise-generation}, we provide an example of a prompt used for generating feedback for programming exercises. 
This prompt incorporates specific instructions to ensure that the feedback is individualized to the student's submission while not revealing the solution.

The feedback generation process for programming exercises begins by identifying the differences between the student's submission repository and the provided template repository. 
These differences are identified using a git diff, which highlights the lines removed and added by the student. 
%TODO Felix: How does the splitting work
If the problem statement is too lengthy or complex, a separate LLM invocation is used to split the problem statement into relevant parts for each file. 
This ensures that the feedback is targeted and relevant to the specific context of the file being reviewed. 
Additionally, a summary of the student's solution across all files is generated using another LLM invocation. 
This summary provides a comprehensive overview of the submission, which is included in the prompt to offer context for the feedback.

In the provided prompt, several key components guide the LLM in creating useful feedback. The \textit{Problem Statement} section contextualizes the student's task and helps the LLM understand the exercise's objectives.
The \textit{Task Instructions} direct the LLM to provide improvement suggestions focusing on educational aspects without offering direct solutions.
\textit{Style Guidelines} ensure the feedback is constructive, specific, balanced, clear, concise, actionable, educational, and contextual.
The \textit{File Path and Content} provide the specific file under review along with its content, aiding the LLM in pinpointing specific lines of code for feedback.
Additionally, \textit{Summary and Diffs} between the template and submission offer additional context, helping the LLM understand the student's changes and their overall approach.

In contrast to other external AI tools such as ChatGPT, FeedbackSystem is integrated directly into the LP and has direct access to every relevant context information, providing immediate feedback to students without requiring manual input.
With FeedbackSystem in place, students do not need to switch between different windows and do not need to copy-paste their solution or the problem statement into other tools. 
FeedbackSystem is also designed in a way that it provides didactically sensible feedback without giving away the solution, which is crucial for educational purposes.

The structure and content of this prompt are designed to emulate a human tutor's approach, ensuring that the feedback is both relevant and supportive of the student's learning process.
By providing such detailed instructions and contextual information, the LLM can generate feedback that is both useful and actionable for students.

% Prompt for generating feedback suggestions of programming exercises
% \noindent\begin{minipage}{\linewidth}
\begin{lstlisting}[style=prompt, basicstyle=\footnotesize, columns=fullflexible, caption={Prompt template for generating feedback for programming exercises.}, captionpos=b, label=lst:prompt-programming-exercise-generation]
You are an AI tutor for programming exercises at a prestigious university.

# Problem statement
%*\highlight{\{problem\_statement\}}*)

# Task
Create non graded improvement suggestions for a student's programming submission that a human tutor would recommend. Assume the tutor is not familiar with the solution. The feedback must contain only the feedback the student can learn from. Important: the answer you generate must not contain any solution suggestions or contain corrected errors. Rather concentrate on incorrectly applied principles or inconsistencies. Students can move some functionality to other files. Students can deviate to some degree from the problem statement or book unless they complete all tasks. Very important, the feedback must be balanced.

# Style
1. Constructive, 2. Specific, 3. Balanced, 4. Clear and Concise, 5. Actionable, 6. Educational, 7. Contextual

It is strictly prohibited to include feedback that contradicts to the problem statement.
No need to mention anything that is not explicitly in the template->submission diff, as it is out of student's control (e.g. exercise package name).

In git diff, lines marked with '-' were removed and with '+' were added by the student.

# The student will be reading your response, use you instead of them

Path: %*\highlight{\{submission\_file\_path\}}*)

File (with line numbers <number>: <line>):
%*\highlight{\{submission\_file\_content\}}*)

Summary of other files in the solution:
%*\highlight{\{summary\}}*)

The template to submission diff (only as reference):
%*\highlight{\{template\_to\_submission\_diff\}}*)
\end{lstlisting}
\vspace{-5mm}
% \end{minipage}

\subsection{Architecture}

FeedbackSystem is deployed in production alongside the learning platform LP, which serves up to 2000 students per course.
Consequently, the reference implementation must satisfy additional non-functional requirements such as performance, scalability, maintainability, and usability.
To meet these requirements and to support feedback generation for multiple exercise types while allowing for future extensibility, we adopted a modular architecture, as illustrated in Figure \ref{fig:Athena-architecture}.

% Felix:
% - Components could be vertically slightly smaller 
\begin{figure}[htbp]
  \vspace{-4mm}
  \centering
  \includegraphics[width=\linewidth]{figures/Athena-Architecture.pdf}
  \caption{UML component diagram showing the top-level architecture of the reference implementation.}
  \label{fig:Athena-architecture}
  \vspace{-4mm}
\end{figure}

The \textit{Module Manager} handles all incoming requests, verifies authorization, and forwards them to the appropriate modules.
The \textit{ProgrammingLLM} module manages programming exercises and executes the three-stage DAFeeD process, which includes formatting, predicting, and parsing. 
Similarly, the \textit{TextLLM} and \textit{ModelingLLM} module are optimized for text and modeling exercises, respectively, and follow the same process.

FeedbackSystem's system design is independent of any specific learning management system (LMS) as it provides a REST API, documented using the OpenAPI standard\footnote{\url{https://www.openapis.org}}.
This independence allows FeedbackSystem to be integrated with various LMS platforms, such as Moodle\footnote{\url{https://moodle.org}}, bringing the benefits of DAFeeD to more universities and students.

FeedbackSystem currently connects to OpenAI models hosted in a private Azure cloud to ensure that student data is not used for training models, maintaining privacy.
Additionally, the system can be configured to use open-source models like Llama\footnote{\url{https://llama.meta.com}} or Mistral\footnote{\url{https://mistral.ai}}, either self-hosted or cloud-based.

To meet performance and scalability requirements, the FeedbackSystem and its modules are deployed within a Kubernetes cluster\footnote{\url{https://kubernetes.io}}.
Kubernetes, in conjunction with the FeedbackSystem's modular architecture, allows the system to scale each module independently.
For example, additional instances of the programming module can be instantiated when a new programming exercise is released to handle the increased load.
Furthermore, Kubernetes provides out-of-the-box load balancing to distribute the load between multiple module instances and self-healing capabilities, ensuring that if a module crashes, it gets automatically restarted.
% TODO MS: It is very important that students have always access to the feedback

\section{Evaluation} % 5 pages
\label{sec:evaluation}

In this section, we outline the methodology employed to validate the effectiveness of the proposed DAFeeD approach including the reference implementation FeedbackSystem.
The conducted evaluation represents the treatment validation phase of the design science methodology proposed by Wieringa \cite{wieringa:2014:DesignScienceMethodologya}.
For this phase, we first evaluated the proposed solution — DAFeeD — in a controlled environment followed by a field study.
The collected data is then utilized for the refinement and improvement of the solution.

We begin by describing the study design and the results.
Subsequently, we outline the limitations of the evaluation and discuss the implications of the findings.


% research questions moved to the introduction
% \subsection{Research Questions}

% With this study, we want to answer the following research questions about direct automated feedback delivery:

% \begin{enumerate}[label=\textbf{RQ\arabic*},ref=RQ\arabic*]
%   \item \label{RQ1} How does the availability of direct automated feedback affect student engagement and motivation? 
%   \item \label{RQ2} Do students feel more comfortable requesting automatic feedback than asking a human tutor or the course professor?
%   \item \label{RQ3} How do students perceive the effectiveness of direct automated feedback?
%   \item \label{RQ4} How do students perceive the usability and helpfulness of DAFeeD?
% \end{enumerate}

% \ref{RQ1} examines the influence of direct automated feedback on students' overall engagement with the course material and their motivation to complete exercises and improve their skills.
% We aim to determine if the immediacy and convenience of automated feedback enhance students' commitment to their coursework.
% \ref{RQ2} explores students' comfort levels with seeking feedback from an automated system compared to traditional sources such as human tutors or course professors. 
% We want to assess whether students prefer the anonymity and immediacy of automated feedback over potentially intimidating interactions with educator.
% \ref{RQ3} seeks to understand students' views on the value and impact of the feedback provided by the DAFeeD system. 
% We are interested in whether students find the feedback to be relevant and beneficial to their learning process.
% \ref{RQ4} focuses on students' perceptions of the usability and helpfulness of the DAFeeD system. 
% We aim to evaluate how intuitive and user-friendly students find the system, as well as how effective they consider it in assisting their learning and improving their performance.

\subsection{Study Design}
The evaluation of the DAFeeD approach was conducted in two distinct stages to ensure a comprehensive assessment of its effectiveness.
The first stage involved a controlled lab experiment where selected students interacted with the system and subsequently provided their perceptions through a structured survey.
This stage aimed to capture initial user impressions and identify potential usability issues.
In the second stage, DAFeeD and its reference implementation, FeedbackSystem, were deployed in an advanced software engineering course on design patterns.
During this stage, quantitative data was collected to evaluate both student performance and system effectiveness in a real-world educational setting.


For the first stage, we designed the ``Code Review'' Java programming exercise, a past introduction to software engineering homework assignment, to simulate a real-world scenario where students review and improve existing code.
The exercise included tasks such as improving Java classes by following good coding practices, refactoring duplicated code using the template method design pattern, catching edge cases in functions, and implementing forgotten methods in the service package.
We enabled FeedbackSystem for this exercise, utilizing OpenAI's GPT-4 Turbo with a temperature setting of 0 to ensure deterministic and consistent feedback generation, reducing randomness and the potential for hallucinations.

We invited 20 participants from current courses at the university via direct messages, including undergraduate and graduate students from various disciplines like computer science, information systems, and games engineering. 
Participants received a two-page participation manual and tested the new feedback feature on the LP in a controlled university environment. 
The evaluation lasted around 45 minutes, focusing on understanding the feedback process rather than completing the exercise.
Participants followed a structured procedure, illustrated in Figure \ref{fig:Study-Design}.
They started by thoroughly reading the participation manual.
Next, they prepared their IDE and accessed the exercise on LP.
They then worked on the exercise, committed and pushed their code, and requested AI feedback iteratively.
After reviewing and acting on the AI-provided feedback, they refined their solutions until they had a good understanding of the feedback process.

\begin{figure}[htbp]
  \vspace{-3mm}
  \centering
  \includegraphics[width=\linewidth]{figures/Study-Design.pdf}
  \vspace{-5mm}
  \caption{UML activity diagram illustrating the study procedure from a participant's perspective.}
  \label{fig:Study-Design}
  \vspace{-3mm}
\end{figure}

Following this hands-on experience, participants were asked to complete a survey hosted on the community version of the open-source survey tool LimeSurvey\footnote{\url{https://www.limesurvey.org}}.
This survey aimed to gather their opinions on direct automated feedback and collect feedback on their overall experience with the feature.
The study employed a mixed methods approach, combining quantitative and qualitative data collection methods.

All survey questions, except for the introductory demographic queries and five final voluntary free-text responses, employed a 5-point Likert scale \cite{allen:2007:LikertScalesData} ranging from ``strongly agree'' to ``strongly disagree'' and were mandatory. 
The survey questions mapped to the research questions as follows:

\begin{enumerate}[label=\textbf{RQ\arabic*}]
  % \item \textbf{Engagement and Motivation}
  %   \begin{enumerate}[label=\textbf{Q\arabic*},resume,ref=Q\arabic*]
  %     \item \label{Q1} The direct automated feedback keeps me more engaged in the learning process.
  %     \item \label{Q2} The direct automatic feedback motivates me to repeatedly improve my code.
  %     \item \label{Q3} The direct automated feedback makes me feel more motivated to complete my programming assignments.
  %     \item \label{Q4} The direct automated feedback encourages me to experiment more with my coding solutions.
  %   \end{enumerate}
  \item \textbf{Comfort with Feedback Source}
  \begin{enumerate}[label=\textbf{Q\arabic*},resume,ref=Q\arabic*, leftmargin=*, itemindent=1.5em]
    \item \label{Q1} I feel more comfortable requesting direct automated feedback than feedback from a human tutor.
    \item \label{Q2} I am likely to request feedback more frequently when using direct automated feedback than feedback from my course professor.
    \item \label{Q3} I find receiving direct automated feedback less intimidating than receiving feedback from a human tutor.
    \item \label{Q4} I feel that requesting direct automated feedback is more convenient than arranging a meeting with a human tutor.
  \end{enumerate}
  \item \textbf{Perceived Effectiveness}
    \begin{enumerate}[label=\textbf{Q\arabic*},resume,ref=Q\arabic*, leftmargin=*, itemindent=1.5em]
      \item \label{Q5} The direct automated feedback helps me understand my mistakes.
      \item \label{Q6} The direct automated feedback is more effective than one-time feedback.
      \item \label{Q7} The direct automated feedback has significantly improved the quality of my programming assignment.
      \item \label{Q8} The direct automated feedback is a helpful addition to the automatic test case results.
      \item \label{Q9} I feel that having access to direct automated feedback continuously helps me more than arranging a meeting with a human tutor.
    \end{enumerate}
  \item \textbf{Usability and Helpfulness}
    \begin{enumerate}[label=\textbf{Q\arabic*},resume,ref=Q\arabic*, leftmargin=*, itemindent=1.5em]
      \item \label{Q10} It is easy to receive direct automated feedback on my programming assignments.
      \item \label{Q11} I would rather use the direct automated feedback integrated into LP than use an external AI tool for getting feedback.
      \item \label{Q12} I find the direct automated feedback helpful in improving my programming skills.
      \item \label{Q13} I am satisfied with the overall performance of the direct automated feedback.
      % \item \label{Q18} Are there any improvements that you would suggest for direct automated feedback?
      % \item \label{Q19} How did you find the feedback?
      % \item \label{Q20} What kind of feedback would you like to receive?
      % \item \label{Q21} Was there anything you particularly liked about the direct automated feedback process?
      % \item \label{Q22} What difficulties did you encounter when using the direct automated feedback process?
    \end{enumerate}
\end{enumerate}

%todo: Describe free text questions

% %Question Group 1
% The survey commences with a series of introductory demographic inquiries, including the student's current study program, ongoing degree pursued, and academic semester. 
% Additionally, respondents are asked to provide information regarding the hardware employed, including details on the operating system and web browser used.
% The final question in this group asks participants to describe their programming experience.

% %Question Group 2
% Following the demographic section, the survey explores participants' opinions on the impact of direct automated feedback on student engagement and motivation.
% %Question Group 3
% Subsequently, the survey queries participants on their comfort with the feedback source, specifically whether they prefer receiving feedback from a human tutor or from the DAFeeD feature.
% %Question Group 4
% The survey then investigates aspects related to participants' perceived effectiveness of the DAFeeD feature.
% %Question Group 5
% In the final question group, participants assess the overall usability and helpfulness of the DAFeeD feature.

Following the initial lab experiment and survey, the second stage of the evaluation involved deploying the FeedbackSystem in the \textit{Software Design Patterns} course, an advanced software engineering lecture with more than 500 active bachelor's and master's students.
This stage aimed to assess the system's performance and scalability in a real-world educational setting.
The deployment allowed for an in-depth investigation of the system's impact on student performance (\ref{RQ4}) and how it compares to human feedback (\ref{RQ5}).

The FeedbackSystem, configured with OpenAI's GPT-4o LLM, was enabled for two exercises, a modeling exercise and a text exercise.
In the text exercise, students had to explain the difference between the strategy and the bridge design pattern, and could achieve a maximum of 4 points.
The modeling exercise required students to create a UML class diagram of a car rental system and was worth 7 points. 

Students could submit their solutions and iteratively request automated feedback to improve their work.
%todo Max: add that the tutors had access to AI feedback suggestions
After the exercises' deadline, which was one week after the release, all submissions were assessed by human tutors to determine the final scores.
Based on the usage data collected during this stage, we analyzed student improvement over multiple submission iterations and also compared the feedback scores from FeedbackSystem against the human tutors' assessments. 


\subsection{Results}

In the following paragraphs, we present the results, starting with stage one, the controlled lab experiment, and then moving on to stage two, the deployment in the advanced software engineering course.
The answers to each of the Likert scale questions are visualized in Figure \ref{fig:Survey-Results}.

\begin{figure}[htpb]
  \centering
  \resizebox{\linewidth}{!}{
    \input{figures/likert_responses-adapted.pgf}
  }
  \vspace{-6mm}
  \caption{Distribution of survey responses to Likert scale questions.}
  \label{fig:Survey-Results}
\end{figure}


% 90\% of students indicated that the direct automated feedback from LP keeps them more engaged in the learning process, with 10\% neutral (\ref{Q1}).
% For \ref{Q2}, 85\% of students stated that the direct automatic feedback motivates them to repeatedly improve their code, with 10\% neutral and 5\% disagreeing.
% For \ref{Q3}, 85\% of students mentioned that the direct automated feedback makes them feel more motivated to complete their programming assignments, with 10\% neutral and 5\% disagreeing.
% The encouragement to experiment more with coding solutions (\ref{Q4}) received a positive response, with 70\% agreeing, 20\% neutral, and 10\% disagreeing.


Comfort levels in requesting feedback showed that 90\% of students feel more comfortable requesting direct automated feedback from LP than feedback from a human tutor, with 10\% neutral (\ref{Q1}).
For \ref{Q2}, 100\% of students noted that they are likely to request feedback more frequently when using direct automated feedback than feedback from their course professor.
Receiving feedback from LP was found to be less intimidating than from a human tutor by 80\% of students, with 10\% neutral and 10\% disagreeing (\ref{Q3}).
Convenience in requesting feedback showed that 100\% of students feel that requesting direct automated feedback is more convenient than arranging a meeting with a human tutor (\ref{Q4}).


In terms of understanding mistakes, 90\% of students believed that the direct automated feedback provided by LP helps them understand their mistakes, with 10\% neutral (\ref{Q5}).
The effectiveness of the feedback was highlighted by 95\% of students who found that the direct automated feedback is more effective than one-time feedback, with 5\% neutral (\ref{Q6}).
Regarding the quality of assignments, 70\% of students observed that DAFeeD has significantly improved the quality of their programming assignments, with 20\% neutral and 10\% disagreeing (\ref{Q7}).
For \ref{Q8}, 95\% of students felt that the direct automated feedback is a helpful addition to the automatic test case results, with 5\% neutral.
Continuous access to feedback was found to be more beneficial than arranging meetings with a tutor by 75\% of students, with 15\% neutral and 10\% disagreeing (\ref{Q9}).


Ease of receiving feedback was highly rated, with 100\% of students confirming that it is easy to receive direct automated feedback on their programming assignments (\ref{Q10}).
Furthermore, 70\% of students preferred using DAFeeD integrated into LP over using an external AI tool for getting feedback, with 25\% neutral and 5\% disagreeing (\ref{Q11}).
In terms of skill improvement, 70\% of students agreed that they find the direct automated feedback helpful in improving their programming skills, with 25\% neutral and 5\% disagreeing (\ref{Q12}).
Lastly, 80\% of students were satisfied with the overall performance of DAFeeD, with 10\% neutral and 10\% disagreeing (\ref{Q13}).


% free text responses
% todo: add Question numbers and some more details
The responses to the five voluntary free text questions highlight several themes.
Many students appreciated the immediate availability of feedback, which allowed for prompt corrections without waiting for manual review. 
However, some respondents suggested improvements such as better categorization of feedback, more detailed explanations of errors, and prioritization of critical issues. 
The feedback was generally found to be relevant and useful in addressing obvious mistakes and improving code quality. 
Students expressed a preference for feedback that clearly identified mistakes and provided specific guidance on how to correct them, along with suggestions for improvement. 
Some challenges included understanding certain automated feedback messages and occasional false positives or negatives in error detection.


For stage two we first analyzed the distribution of scores across submission attempts for the modeling and text exercises, as shown in Figure \ref{fig:score-distribution-by-submission-attempt}.
Since students were not obliged to request automatic feedback, the sample sizes differ across submission attempts.
The results indicate an overall improvement in scores with subsequent submission attempts.

\begin{figure}[htbp]
  \vspace{-3mm}
  \centering
  \begin{subfigure}{0.40\textwidth}
    \centering
    \includegraphics[width=\linewidth]{figures/distribution-submission-attempts-modeling.pdf}
    \vspace{-6mm}
    \caption{Score distribution for the modeling exercise.}
    \label{fig:distribution-submission-attempts-modeling}
  \end{subfigure}
  \begin{subfigure}{0.40\textwidth}
    \centering
    \includegraphics[width=\linewidth]{figures/distribution-submission-attempts-text.pdf}
    \vspace{-6mm}
    \caption{Score distribution for the text exercise.}
    \label{fig:distribution-submission-attempts-text}
  \end{subfigure}
  \vspace{-2mm}
  \caption{Distribution of FeedbackSystem's scores across submission attempts.}
  \label{fig:score-distribution-by-submission-attempt}
  \vspace{-3mm}
\end{figure}

In the modeling exercise (Figure \ref{fig:distribution-submission-attempts-modeling}), the average scores increased steadily across multiple submission attempts.
The initial average score was 44\% in the first attempt, rising to 61\% by the sixth attempt.
However, a slight decline was observed in the seventh attempt, where the score dropped to 59\%.
The scores then showed a peak of 65\% in the ninth attempt.
After this peak, the scores declined down to 57\% in the eleventh attempt.

For the text exercise (Figure \ref{fig:distribution-submission-attempts-text}), the average scores exhibited a more consistent upward trajectory.
The initial average score of 64\% in the first attempt increased significantly to 82\% by the second attempt and further improved to 88\% by the third attempt.
Despite a minor decline to 80\% in the fourth attempt, subsequent submissions saw an upward trend from 83\% in the fifth to 100\% in the sixth attempt.


We analyzed the score differences between the FeedbackSystem and human tutors for both exercises, as illustrated in Figure \ref{fig:score-differences}.
To ensure a fair comparison, only the latest student submissions that received feedback from the FeedbackSystem and remained unchanged until the exercise deadline were considered.
This approach guarantees that both the FeedbackSystem and human tutors assessed identical versions of student submissions.

The score differences were computed by subtracting the FeedbackSystem's assigned score from the human tutor's score, where negative values indicate instances in which the human tutor assigned a lower score than the FeedbackSystem.
To facilitate the visualization of a 1-point score deviation, the data was grouped into equal-sized intervals based on the total achievable points for each exercise, with each interval representing 15\% (1/7) of the total score for the modeling exercise and 25\% (1/4) for the text exercise.

\begin{figure}[htbp]
  \centering
  \begin{subfigure}{0.40\textwidth}
    \centering
    \includegraphics[width=\linewidth]{figures/score-difference-modeling.pdf}
    \vspace{-6mm}
    \caption{Score differences for the modeling exercise.}
    \label{fig:score-differences-modeling}
  \end{subfigure}
  \begin{subfigure}{0.40\textwidth}
    \centering
    \includegraphics[width=\linewidth]{figures/score-difference-text.pdf}
    \vspace{-6mm}
    \caption{Score differences for the text exercise.}
    \label{fig:score-differences-text}
  \end{subfigure}
  \vspace{-2mm}
  \caption{Distribution of score differences between the FeedbackSystem and a human tutor. Negative values indicate a lower score from the tutor.}
  \label{fig:score-differences}
\end{figure}

The analysis of score differences between the FeedbackSystem and the human tutors revealed distinct trends for the modeling and text exercises.
For the modeling exercise, the FeedbackSystem generally assigned lower scores compared to the human tutors, as visualized in Figure \ref{fig:score-differences-modeling}.
29\% of submissions received scores that were between 15\% and 30\% higher when graded by the human tutors.
Additionally, 18\% had smaller underestimations, with differences ranging from 0\% to 15\%.
In contrast, the FeedbackSystem provided higher scores than the human tutors for 27\% of submissions, with differences ranging from -15\% to 0\%.
A moderate overestimation was observed in five submissions (9\%), with score differences between -30\% and -15\%, while extreme overestimations were rare (2\%) falling within the -45\% to -30\% range.
7\% of submissions received identical scores.

For the text exercise, the FeedbackSystem consistently provided higher scores than the human tutors, as depicted in Figure \ref{fig:score-differences-text}.
The majority of submissions (39\%) fell within the range of -25\% to 0\%, indicating that the FeedbackSystem assigned slightly higher scores.
A significant portion (24\%) showed more considerable overestimations within the -50\% to -25\% range, while 6\% experienced substantial overestimations ranging from -75\% to -50\%.
In contrast, identical scores were observed for 51 submissions (27\%).
Cases where the FeedbackSystem assigned lower scores than the human tutors were infrequent, with only 4\% showing positive differences, and no submissions exceeded a difference of 50\%.

\subsection{Findings}

%todo: add question numbers to findings
% original RQ1, removed

% The responses to \ref{RQ1} show that the availability of direct automated feedback can significantly enhance student engagement and motivation.
% Students reported feeling more engaged in the learning process.
% The majority of participants also stated they are motivated to repeatedly improve their code and complete their programming assignments.
% Additionally, the feedback encouraged a notable percentage of participants to experiment more with their coding solutions.
% These findings suggest that direct automated feedback is highly effective in boosting both engagement and motivation among students.

% \findingsbox{%
% 	\textbf{Main Findings for RQ1:} The availability of direct automated feedback significantly enhances student engagement and motivation. 
%   Students feel more engaged in the learning process, motivated to improve their code, and encouraged to experiment more with their coding solutions, without having to wait for manual feedback.
% }%
% \vspace{8pt}%

The responses to \ref{RQ1} reveal a strong level of comfort for requesting automated feedback compared to traditional human feedback channels.
Students stated they feel more comfortable requesting automated feedback than from human tutors and were likely to request automated feedback more frequently than from their course professors.
Requesting automated feedback was perceived as less intimidating for most of the participants and all of them stated it is more convenient than arranging meetings with a human tutor.
These findings highlight the effectiveness of automated feedback in providing a more comfortable and accessible feedback mechanism for students.

\findingsbox{%
	\textbf{Main Findings for RQ1:} Students feel more comfortable requesting automated feedback than human feedback. 
  They are likely to request automated feedback more frequently and find it less intimidating and more convenient than arranging meetings with a human tutor.
}%
\vspace{8pt}%

The responses to \ref{RQ2} indicate that students think automated feedback is highly effective in helping them understand and improve their programming assignments. 
Students reported that the feedback helped them understand their mistakes and found it more effective than receiving only one-time feedback for their submission.
The majority reported that the feedback significantly improved the quality of their programming assignments, and all participants stated that automatic feedback is a helpful addition to automatic test case results generated by LP.
In addition, most participants saw continuous access to automated feedback as more beneficial than arranging meetings with a tutor. 
These findings suggest that automated feedback not only aids in error identification but also significantly enhances the overall quality of student assignments.

\findingsbox{%
	\textbf{Main Findings for RQ2:} Students perceive automated feedback as highly effective in helping them understand and improve their programming assignments. 
  The feedback helps them understand their mistakes, improves the quality of their assignments, and is a helpful addition to automatic test case results.
}%
\vspace{8pt}%

The responses to \ref{RQ3} demonstrate the ease of receiving feedback and overall satisfaction with DAFeeD's feedback process and its reference implementation. 
Students found it easy to receive feedback on their programming assignments.
A large number of participants preferred using the feedback integrated into LP than copy their submission and relevant context information over to an external AI tool. 
Most participants also deemed the feedback helpful in improving their programming skills. 
In regard to the overall performance of DAFeeD, the majority of students expressed high satisfaction with the system.
%These findings underscore the high level of user satisfaction and the perceived effectiveness of Artemis' automated feedback in enhancing programming skills.
%TODO: free text answers including improvements

\findingsbox{%
	\textbf{Main Findings for RQ3:} Students find it easy to receive feedback on their programming assignments and are satisfied with the overall performance of DAFeeD and its reference implementation. 
  There are some suggestions for improvements, such as better categorization of feedback, more detailed explanations of errors, and prioritization of critical issues.
}%
\vspace{8pt}%

The analysis of the score distributions across submission attempts for the modeling and text exercises revealed an overall improvement in student performance over time.
Average scores increased steadily during the initial attempts, followed by a slight decline in later attempts.
This decline can be attributed to students who achieved high scores early being less likely to submit further, leaving primarily lower-scoring students to continue.
The decreasing number of submissions over time supports this explanation, indicating that students who met their goals early were less inclined to make additional attempts.
The observed improvements suggest that the direct automated feedback provided by the FeedbackSystem effectively supports student learning by guiding them through an iterative improvement process.

\findingsbox{%
	\textbf{Main Findings for RQ4:} Students improved their performance across multiple submission attempts with the feedback provided by the FeedbackSystem.
  The steady increase in scores indicates the effectiveness of automated feedback in facilitating learning.
}%
\vspace{8pt}%

The comparison of scores between the FeedbackSystem and human tutors for the modeling and text exercises revealed distinct trends.
In the modeling exercise, the FeedbackSystem generally assigned lower scores compared to human tutors, suggesting a more conservative grading approach or limitations in the system's ability to fully interpret complex submissions.
Conversely, in the text exercise, the FeedbackSystem tended to provide higher scores compared to human tutors.
Despite these trends, the analysis showed that over 52\% of modeling submissions and more than 69\% of text submissions exhibited either no difference or a 1-point difference between the FeedbackSystem and human tutors.
These results suggest that the quality of the feedback provided by the FeedbackSystem is comparable to that of human tutors, although some discrepancies remain and present opportunities for future improvement.
%Highlight importance of grading critera, criteria wie bei manueller korrektur wichtig
Furthermore, we noticed the critical role of well-defined grading criteria in enhancing the quality of automated feedback.
Similar to manual assessment by human tutors, clear and course-specific grading criteria are essential for ensuring consistency and relevance in automated evaluations.

\findingsbox{%
	\textbf{Main Findings for RQ5:} The FeedbackSystem generally assigned lower scores in modeling exercises and higher scores in text exercises.
  Over half of the submissions showed minimal score differences, indicating comparable feedback quality.
}%
\vspace{8pt}%


\subsection{Discussion}

Overall, students received DAFeeD's reference implementation FeedbackSystem integrated into LP positively, indicating its effectiveness in enhancing their skills.
% Had to be removed as it relates to the original RQ1
% The high levels of engagement, motivation, and satisfaction reported by the students underscore the system's potential to significantly improve the learning experience.
% TODO Max: Integrate remaining findings from the survey
The iterative submission process, supported by automated feedback, allowed students to refine their work progressively, leading to measurable improvements in their performance across multiple submission attempts.

The findings validate the role of automated feedback in fostering an interactive learning environment.
Students reported feeling more comfortable requesting automated feedback than human feedback, highlighting the system's convenience and accessibility.
This interactive aspect of learning, facilitated by timely and continuous feedback, helps students develop critical problem-solving skills and deepen their understanding of programming concepts.

Despite the overall positive reception, there is room for improving the quality of the feedback provided.
The comparative analysis of FeedbackSystem and human tutor scores highlighted certain discrepancies, with the system generally assigning lower scores in modeling exercises and higher scores in text exercises.
While over half of the submissions showed only minimal differences between the automated and human evaluations, the remaining inconsistencies suggest that refinements are needed to better align the automated scoring with human judgment.
% unbedingt reinschreiben mit tutorenevaluation ground truth, aber tutoren auch nicht immer richtig
Here it is important to note that while tutor assessments are treated as ground truth data in this analysis, but human grading is not infallible.
The approach presented in this paper primarily supports the concept of formative assessment and is particularly effective when implemented fully automatically, especially in contexts where summative assessment is not the primary focus.

While students found the feedback helpful in understanding their mistakes and improving their assignments, we need to further refine it to enhance its precision and relevance.
Incorporating more advanced AI techniques, such as Chain-of-Thought \cite{wei:2022:ChainofThoughtPromptingElicits}, ReAct \cite{yao:2023:ReActSynergizingReasoning}, Retrieval-Augmented Generation (RAG) \cite{gao:2024:RetrievalAugmentedGenerationLarge,ardimento:2024:TeachingUMLUsinga},
could address these issues, offering more tailored and context-specific guidance to students.
Specifically, a RAG approach could enhance FeedbackSystem by retrieving relevant course materials, such as lecture slides, course discussions, and past feedback examples, to provide more contextually accurate and tailored responses that align with the course's learning objectives, offering students clearer explanations and actionable suggestions for improvement.

This work aligns with findings from other studies, highlighting the importance of timely and continuous feedback in education \cite{shute:2008:FocusFormativeFeedback, dawson:2019:WhatMakesEffective}.
Additionally, we believe that direct automated feedback should complement, rather than substitute, traditional human feedback.
The convenience and accessibility of automated feedback are clear advantages, yet the nuanced insights that human tutors provide remain irreplaceable, including emotional support and empathy.

As the study is intended as the treatment validation stage for the proposed feedback process and should also provide important insights into the usability of the reference implementation, we assume that the results correctly reflect the tendency despite the rather small sample size for the controlled lab experiment. 
\citet{nielsen:2000:WhyYouOnly} has shown that smaller, iterative tests can reveal the majority of usability issues, making such an approach both practical and effective.

% Possible Problems with AI in education
We acknowledge that different challenges exist when using AI in education, and we are aware of their potential to negatively influence the learning experience of the diverse student body \cite{kasneci:2023:ChatGPTGoodOpportunitiesa}.
First, the competent use of AI must be explicitly taught to both educators and students to ensure effective and ethical utilization.
Addressing the issue of AI hallucinations, a RAG approach can be employed to enhance accuracy \cite{gao:2024:RetrievalAugmentedGenerationLarge}.
To mitigate over-reliance on AI, it is crucial to encourage students to independently solve certain problems, fostering critical thinking and problem-solving skills.
Bias in AI can be reduced by using models that are designed to minimize prejudices, ensuring fairness in feedback.
Additionally, privacy and cost concerns can be alleviated by deploying locally trained open-source models, which offer greater control over data and reduce dependency on commercial AI solutions.
These measures collectively promote a balanced and responsible integration of AI in the educational process.

In conclusion, DAFeeD shows significant potential in improving computing education.
By addressing the areas identified for improvement and continuing to refine the system, we can harness the full benefits of automated feedback to support and enhance student learning.


\subsection{Limitations}
We follow the categorization framework proposed by Runeson and Höst \cite{runeson:2009:GuidelinesConductingReporting} to outline the limitations of the conducted evaluation, addressing potential threats to internal, external, and construct validity:

% internal validity
% copilot: Internal validity threats include the potential for confounding variables, such as the students' prior experience with automated feedback systems, which could influence their perceptions of the DAFeeD system.
\textit{Internal Validity}: This study may be compromised by using self-reported survey data, which can introduce biases. 
Participants' perceptions may be influenced by their individual attitudes or varying levels of familiarity with programming concepts, leading to potential inaccuracies.
Additionally, the participants' perceived effectiveness does not necessarily correspond to objective effectiveness.
The assumption that human tutor assessments are entirely accurate may affect the validity of our findings, as tutors can introduce subjective biases or errors in their evaluations.

% external validity
\textit{External Validity}: Threats to external validity arise from the specific context of this study. 
Conducting the research exclusively at a single university and with students from computer science, information systems, and similar programs restricts the diversity of the sample. 
This narrow focus may limit the applicability of the findings to other educational settings or student populations. 
In addition, the small sample size of the controlled lab experiment may also limit the generalizability of some findings. 

% construct validity
\textit{Construct Validity}: 
The survey questions designed to evaluate 'perceived effectiveness' and 'comfort with feedback source' may not fully encompass the breadth of these constructs.
Factors such as prior experiences and personal preferences, which the survey does not account for, could influence participants' responses and perceptions.
The use of a specific sample exercise to evaluate the automated feedback process may align more closely with some students' prior experiences or learning styles, introducing bias into the results.


\section{Conclusion} % 1 page
\label{sec:conclusion}

% todo: summarize the contributions and outcome
The main contributions of this paper include the introduction of DAFeeD, a direct automated feedback delivery system, and the development of its reference implementation, FeedbackSystem, which integrates DAFeeD into the learning platform LP.
In addition, we conducted a comprehensive evaluation to demonstrate the system's effectiveness in improving student performance and engagement through iterative feedback.
DAFeeD enables the interactive learning process by providing students with immediate, context-specific feedback on their submissions.
The implementation of DAFeeD demonstrates the feasibility and advantages of incorporating automated feedback systems into computing education. 
For educators, this can alleviate the heavy workload associated with providing individualized feedback, allowing them to focus more on instructional design and student support.
For students, the evaluation of DAFeeD showed that they perceive the automated feedback as effective, helpful, and easy to use, and that it enhances their engagement and motivation.

%future work
% 
Future work includes enhancing the visualization of feedback by grouping and color-coding feedback items to differentiate between critical feedback, suggestions for improvement, and positive comments. 
A priority will be further improving the quality of the feedback provided.
Integrating direct automated feedback into other courses and universities, and empirically evaluating its effects on learning experience, performance, and motivation is another crucial step.
Further, we plan to consider potential issues with AI in education, such as over-reliance and ethical implications in the learning process.

\appendix

\section{Acknowledgments}
This paper includes sections revised with the help of ChatGPT to enhance clarity and readability.
We have carefully reviewed all text produced by AI.
The datasets for the evaluation discussed in this paper are available on OSF\footnote{\url{https://osf.io/p9szv/?view_only=7d0b9889657244dcaee896070d7c15da}}.

\newpage
\balance
%%
%% The next two lines define the bibliography style to be used, and
%% the bibliography file.
\bibliographystyle{ACM-Reference-Format}
\bibliography{main}


\end{document}
\endinput
%%