@article{Zhang1474896, author = {Zhang, Long and Morin, Brice and Haller, Philipp and Baudry, Benoit and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS; KTH, Software and Computer systems, SCS; SINTEF Digital, Oslo, Norway}, journal = {IEEE Transactions on Software Engineering}, note = {QC 20230630}, number = {11}, pages = {2534--2548}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {A Chaos Engineering System for Live Analysis and Falsification of Exception-handling in the JVM}, volume = {47}, DOI = {10.1109/TSE.2019.2954871}, keywords = {dynamic analysis, exception-handling, production systems, chaos engineering}, abstract = {Software systems contain resilience code to handle those failures and unexpected events happening in production. It is essential for developers to understand and assess the resilience of their systems. Chaos engineering is a technology that aims at assessing resilience and uncovering weaknesses by actively injecting perturbations in production. In this paper, we propose a novel design and implementation of a chaos engineering system in Java called ChaosMachine. It provides a unique and actionable analysis on exception-handling capabilities in production, at the level of try-catch blocks. To evaluate our approach, we have deployed ChaosMachine on top of 3 large-scale and well-known Java applications totaling 630k lines of code. Our results show that ChaosMachine reveals both strengths and weaknesses of the resilience code of a software system at the level of exception handling. }, year = {2021} } @unpublished{Ye1268031, author = {Ye, He and Martinez, Matias and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, note = {QC 20181206}, title = {A Comprehensive Study of Automatic Program Repair on the QuixBugs Benchmark}, abstract = {Automatic program repair papers tend to repeatedly use the same benchmarks. 
This poses a threat to the external validity of the findings of the program repair research community. In this paper, we perform an automatic repair experiment on a benchmark called QuixBugs that has been recently published. This benchmark has never been studied in the context of program repair. In this study, we report on the characteristics of QuixBugs, and we design and perform an experiment about the effectiveness of test-suite based program repair on QuixBugs. We study two repair systems, Astor and Nopol, which are representatives of generate-and-validate repair technique and synthesis repair technique respectively. We propose three patch correctness assessment techniques to comprehensively study overfitting and incorrect patches. Our key results are: 1) 13/40 buggy programs in the QuixBugs can be repaired with a test-suite adequate patch; 2) a total of 22 different plausible patches for those 13 buggy programs in the QuixBugs are present in the search space of the considered tools; 3) the three patch assessment techniques discard in total 12/22 patches that are overfitting. This sets a baseline for future research of automatic repair on QuixBugs. Our experiment also highlights the major properties and challenges of how to perform automated correctness assessment of program repair patches. All experimental results are publicly available on Github in order to facilitate future research on automatic program repair. 
}, year = {2018} } @inproceedings{Ye1782822, author = {Ye, He and Martinez, Matias and Durieux, Thomas and Monperrus, Martin}, booktitle = {IBF 2019 : 2019 IEEE 1st International Workshop on Intelligent Bug Fixing}, institution = {KTH, Theoretical Computer Science, TCS; KTH, Software and Computer systems, SCS}, note = {Part of ISBN 9781728118093. QC 20230921}, pages = {1--10}, eid = {8665475}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {A Comprehensive Study of Automatic Program Repair on the QuixBugs Benchmark}, DOI = {10.1109/IBF.2019.8665475}, abstract = {Automatic program repair papers tend to repeatedly use the same benchmarks. This poses a threat to the external validity of the findings of the program repair research community. In this paper, we perform an automatic repair experiment on a benchmark called QuixBugs that has never been studied in the context of program repair. In this study, we report on the characteristics of QuixBugs, and study five repair systems, Arja, Astor, Nopol, NPEfix and RSRepair, which are representatives of generate-and-validate repair techniques and synthesis repair techniques. We propose three patch correctness assessment techniques to comprehensively study overfitting and incorrect patches. Our key results are: 1) 15 / 40 buggy programs in the QuixBugs can be repaired with a test-suite adequate patch; 2) a total of 64 plausible patches for those 15 buggy programs in the QuixBugs are present in the search space of the considered tools; 3) the three patch assessment techniques discard in total 33 / 64 patches that are overfitting. This sets a baseline for future research of automatic repair on QuixBugs. Our experiment also highlights the major properties and challenges of how to perform automated correctness assessment of program repair patches. All experimental results are publicly available on Github in order to facilitate future research on automatic program repair. 
}, year = {2019} } @article{Ye1506248, author = {Ye, He and Martinez, Matias and Durieux, Thomas and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {KTH, Software and Computer systems, SCS}, institution = {Université Polytechnique Hauts-de-France, France}, journal = {Journal of Systems and Software}, note = {QC 20201202}, eid = {110825}, publisher = {Elsevier Inc.}, title = {A comprehensive study of automatic program repair on the QuixBugs benchmark}, volume = {171}, DOI = {10.1016/j.jss.2020.110825}, keywords = {Automatic program repair, Bug benchmark, Patch correctness assessment, Software engineering, Assessment technique, Automatic programs, Empirical studies, External validities, Overfitting, Repair tools, Research communities, Automatic test pattern generation}, abstract = {Automatic program repair papers tend to repeatedly use the same benchmarks. This poses a threat to the external validity of the findings of the program repair research community. In this paper, we perform an empirical study of automatic repair on a benchmark of bugs called QuixBugs, which has been little studied. In this paper, (1) We report on the characteristics of QuixBugs; (2) We study the effectiveness of 10 program repair tools on it; (3) We apply three patch correctness assessment techniques to comprehensively study the presence of overfitting patches in QuixBugs. Our key results are: (1) 16/40 buggy programs in QuixBugs can be repaired with at least a test suite adequate patch; (2) A total of 338 plausible patches are generated on the QuixBugs by the considered tools, and 53.3% of them are overfitting patches according to our manual assessment; (3) The three automated patch correctness assessment techniques, RGTEvosuite, RGTInputSampling and GTInvariants, achieve an accuracy of 98.2%, 80.8% and 58.3% in overfitting detection, respectively. 
To our knowledge, this is the largest empirical study of automatic repair on QuixBugs, combining both quantitative and qualitative insights. All our empirical results are publicly available on GitHub in order to facilitate future research on automatic program repair.  }, year = {2021} } @article{SotoValero1547362, author = {Soto Valero, Cesar and Harrand, Nicolas and Monperrus, Martin and Baudry, Benoit}, institution = {KTH, Software and Computer systems, SCS}, institution = {KTH, Theoretical Computer Science, TCS}, journal = {Empirical Software Engineering}, note = {QC 20210519}, number = {3}, eid = {45}, publisher = {Springer Nature}, title = {A comprehensive study of bloated dependencies in the Maven ecosystem}, volume = {26}, DOI = {10.1007/s10664-020-09914-8}, keywords = {Dependency management, Software reuse, Debloating, Program analysis}, abstract = {Build automation tools and package managers have a profound influence on software development. They facilitate the reuse of third-party libraries, support a clear separation between the application's code and its external dependencies, and automate several software development tasks. However, the wide adoption of these tools introduces new challenges related to dependency management. In this paper, we propose an original study of one such challenge: the emergence of bloated dependencies. Bloated dependencies are libraries that are packaged with the application's compiled code but that are actually not necessary to build and run the application. They artificially grow the size of the built binary and increase maintenance effort. We propose DepClean, a tool to determine the presence of bloated dependencies in Maven artifacts. We analyze 9,639 Java artifacts hosted on Maven Central, which include a total of 723,444 dependency relationships. 
Our key result is as follows: 2.7% of the dependencies directly declared are bloated, 15.4% of the inherited dependencies are bloated, and 57% of the transitive dependencies of the studied artifacts are bloated. In other words, it is feasible to reduce the number of dependencies of Maven artifacts to 1/4 of its current count. Our qualitative assessment with 30 notable open-source projects indicates that developers pay attention to their dependencies when they are notified of the problem. They are willing to remove bloated dependencies: 21/26 answered pull requests were accepted and merged by developers, removing 140 dependencies in total: 75 direct and 65 transitive. }, year = {2021} } @article{Ginelli1660543, author = {Ginelli, Davide and Martinez, Matias and Mariani, Leonardo and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Univ Milano Bicocca, Milan, Italy.}, institution = {Univ Polytech Hauts de France, Valenciennes, France.}, institution = {Univ Milano Bicocca, Milan, Italy.}, journal = {Empirical Software Engineering}, note = {QC 20220817}, number = {4}, eid = {97}, publisher = {Springer Nature}, title = {A comprehensive study of code-removal patches in automated program repair}, volume = {27}, DOI = {10.1007/s10664-021-10100-7}, keywords = {Automatic program repair, Code-removal patches, Software testing, Debugging}, abstract = {Automatic Program Repair (APR) techniques can promisingly help reduce the cost of debugging. Many relevant APR techniques follow the generate-and-validate approach, that is, the faulty program is iteratively modified with different change operators and then validated with a test suite until a plausible patch is generated. In particular, Kali is a generate-and-validate technique developed to investigate the possibility of generating plausible patches by only removing code. Former studies show that indeed Kali successfully addressed several faults. 
This paper addresses the single and particular case of code-removal patches in automated program repair. We investigate the reasons and the scenarios that make their creation possible, and the relationship with patches implemented by developers. Our study reveals that code-removal patches are often insufficient to fix bugs, and proposes a comprehensive taxonomy of code-removal patches that provides evidence of the problems that may affect test suites, opening new opportunities for researchers in the field of automatic program repair. }, year = {2022} } @article{Vera-Perez1348179, author = {Vera-Perez, Oscar Luis and Danglot, Benjamin and Monperrus, Martin and Baudry, Benoit}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {KTH, Software and Computer systems, SCS}, institution = {Inria Rennes Bretagne Atlantique, Campus Beaulieu 263 Ave Gen Leclerc, F-35042 Rennes, France.}, institution = {Inria Lille Nord Europe, Parc Sci Haute Borne 40,Ave Halley Bat A Pk Plaza, F-59650 Villeneuve Dascq, France.}, journal = {Empirical Software Engineering}, note = {QC 20190903}, number = {3}, pages = {1195--1225}, title = {A comprehensive study of pseudo-tested methods}, volume = {24}, DOI = {10.1007/s10664-018-9653-2}, keywords = {Software testing, Software developers, Pseudo-tested methods, Test quality, Program analysis}, abstract = {Pseudo-tested methods are defined as follows: they are covered by the test suite, yet no test case fails when the method body is removed, i.e., when all the effects of this method are suppressed. This intriguing concept was coined in 2016, by Niedermayr and colleagues, who showed that such methods are systematically present, even in well-tested projects with high statement coverage. This work presents a novel analysis of pseudo-tested methods. First, we run a replication of Niedermayr's study with 28K+ methods, enhancing its external validity thanks to the use of new tools and new study subjects. 
Second, we perform a systematic characterization of these methods, both quantitatively and qualitatively with an extensive manual analysis of 101 pseudo-tested methods. The first part of the study confirms Niedermayr's results: pseudo-tested methods exist in all our subjects. Our in-depth characterization of pseudo-tested methods leads to two key insights: pseudo-tested methods are significantly less tested than the other methods; yet, for most of them, the developers would not pay the testing price to fix this situation. This calls for future work on targeted test generation to specify those pseudo-tested methods without spending developer time. }, year = {2019} } @inproceedings{Papoudakis1167092, author = {Papoudakis, G. and Preux, P. and Monperrus, Martin}, booktitle = {6th International Conference on Complex Networks and Their Applications, Complex Networks 2017 : }, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Université de Lille, CRIStAL & Inria, Villeneuve d’Ascq, France}, institution = {Université de Lille, CRIStAL & Inria, Villeneuve d’Ascq, France}, note = {Part of ISBN 9783319721491QC 20171218}, pages = {531--542}, title = {A generative model for sparse, evolving digraphs}, series = {Studies in Computational Intelligence}, number = {689}, volume = {689}, DOI = {10.1007/978-3-319-72150-7_43}, abstract = {Generating graphs that are similar to real ones is an open problem, while the similarity notion is quite elusive and hard to formalize. In this paper, we focus on sparse digraphs and propose SDG, an algorithm that aims at generating graphs similar to real ones. Since real graphs are evolving and this evolution is important to study in order to understand the underlying dynamical system, we tackle the problem of generating series of graphs. We propose SEDGE, an algorithm meant to generate series of graphs similar to a real series. SEDGE is an extension of SDG. 
We consider graphs that are representations of software programs and show experimentally that our approach outperforms other existing approaches. Experiments show the performance of both algorithms. }, year = {2018} } @article{Harrand1372273, author = {Harrand, Nicolas and Allier, Simon and Rodriguez-Cancio, Marcelino and Monperrus, Martin and Baudry, Benoit}, institution = {KTH, Software and Computer systems, SCS}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {DGA, Val De Reuil, France.}, institution = {Vanderbildt Univ, Nashville, TN USA.}, journal = {Genetic Programming and Evolvable Machines}, note = {QC 20191122}, number = {4}, pages = {531--580}, title = {A journey among Java neutral program variants}, volume = {20}, DOI = {10.1007/s10710-019-09355-3}, keywords = {Neutral program variant, Program transformation, Java, Code plasticity}, abstract = {Neutral program variants are alternative implementations of a program, yet equivalent with respect to the test suite. Techniques such as approximate computing or genetic improvement share the intuition that potential for enhancements lies in these acceptable behavioral differences (e.g., enhanced performance or reliability). Yet, the automatic synthesis of neutral program variants, through program transformations remains a key challenge. This work aims at characterizing plastic code regions in Java programs, i.e., the code regions that are modifiable while maintaining functional correctness, according to a test suite. Our empirical study relies on automatic variations of 6 real-world Java programs. First, we transform these programs with three state-of-the-art program transformations: add, replace and delete statements. 
We get a pool of 23,445 neutral variants, from which we gather the following novel insights: developers naturally write code that supports fine-grain behavioral changes; statement deletion is a surprisingly effective program transformation; high-level design decisions, such as the choice of a data structure, are natural points that can evolve while keeping functionality. Second, we design 3 novel program transformations, targeted at specific plastic regions. New experiments reveal that respectively 60%, 58% and 73% of the synthesized variants (175,688 in total) are neutral and exhibit execution traces that are different from the original. }, year = {2019} } @article{Danglot1363140, author = {Danglot, Benjamin and Vera-Perez, Oscar and Yu, Zhongxing and Zaidman, Andy and Monperrus, Martin and Baudry, Benoit}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {KTH, Software and Computer systems, SCS}, institution = {INRIA, Lille, France.}, institution = {INRIA, Rennes, France.}, institution = {Delft Univ Technol, Delft, Netherlands.}, journal = {Journal of Systems and Software}, note = {QC 20191022}, eid = {UNSP 110398}, title = {A snowballing literature study on test amplification}, volume = {157}, DOI = {10.1016/j.jss.2019.110398}, keywords = {Test amplification, Test augmentation, Test optimization, Test regeneration, Automatic testing}, abstract = {The adoption of agile approaches has put an increased emphasis on testing, resulting in extensive test suites. These suites include a large number of tests, in which developers embed knowledge about meaningful input data and expected properties as oracles. This article surveys works that exploit this knowledge to enhance manually written tests with respect to an engineering goal (e.g., improve coverage or refine fault localization). 
While these works rely on various techniques and address various goals, we believe they form an emerging and coherent field of research, which we coin "test amplification". We devised a first set of papers from DBLP, searching for all papers containing "test" and "amplification" in their title. We reviewed the 70 papers in this set and selected the 4 papers that fit the definition of test amplification. We use them as the seeds for our snowballing study, and systematically followed the citation graph. This study is the first that draws a comprehensive picture of the different engineering goals proposed in the literature for test amplification. We believe that this survey will help researchers and practitioners entering this new field to understand more quickly and more deeply the intuitions, concepts and techniques used for test amplification. }, year = {2019} } @article{Baudry1583272, author = {Baudry, Benoit and Chen, Zimin and Etemadi, Khashayar and Fu, Han and Ginelli, Davide and Kommrusch, Steve and Martinez, Matias and Monperrus, Martin and Ron Arteaga, Javier and Ye, He and Yu, Zhongxing}, institution = {KTH, Software and Computer systems, SCS}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Univ Milano Bicocca, Comp Sci, I-20166 Milan, Italy.}, institution = {Colorado State Univ, Machine Learning, Ft Collins, CO 80523 USA.}, institution = {Univ Polytech Hauts De France, F-59260 Valenciennes, France.}, institution = {KTH Royal Inst Technol, Software Engn, S-11428 Stockholm, Sweden.}, institution = {Shandong Univ, Sch Comp Sci & Technol, Jinan 266237, Peoples R China.}, journal = {IEEE Software}, note = {QC 20210805}, number = {4}, pages = {28--35}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {A Software-Repair Robot Based on Continual Learning}, volume = {38}, DOI = {10.1109/MS.2021.3070743}, keywords = {Maintenance engineering, Computer bugs, Software development management, Bot (Internet), 
Training data, Machine learning}, abstract = {Software bugs are common, and correcting them accounts for a significant portion of the costs in the software development and maintenance process. In this article, we discuss R-Hero, our novel system for learning how to fix bugs based on continual training. }, year = {2021} } @article{Yu1295975, author = {Yu, Zhongxing and Martinez, Matias and Danglot, Benjamin and Durieux, Thomas and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Inria Lille Nord Europe, Ave Halley, F-59650 Villeneuve Dascq, France.}, institution = {Univ Valenciennes, Malvache Bldg,Campus Mont Houy, F-59313 Valenciennes 9, France.}, institution = {Inria Lille Nord Europe, Ave Halley, F-59650 Villeneuve Dascq, France.}, institution = {Inria Lille Nord Europe, Ave Halley, F-59650 Villeneuve Dascq, France.}, journal = {Empirical Software Engineering}, note = {QC 20190611}, number = {1}, pages = {33--67}, publisher = {SPRINGER}, title = {Alleviating patch overfitting with automatic test generation : a study of feasibility and effectiveness for the Nopol repair system}, volume = {24}, DOI = {10.1007/s10664-018-9619-4}, keywords = {Program repair, Synthesis-based repair, Patch overfitting, Automatic test case generation}, abstract = {Among the many different kinds of program repair techniques, one widely studied family of techniques is called test suite based repair. However, test suites are in essence input-output specifications and are thus typically inadequate for completely specifying the expected behavior of the program under repair. Consequently, the patches generated by test suite based repair techniques can just overfit to the used test suite, and fail to generalize to other tests. We deeply analyze the overfitting problem in program repair and give a classification of this problem. This classification will help the community to better understand and design techniques to defeat the overfitting problem. 
We further propose and evaluate an approach called UnsatGuided, which aims to alleviate the overfitting problem for synthesis-based repair techniques with automatic test case generation. The approach uses additional automatically generated tests to strengthen the repair constraint used by synthesis-based repair techniques. We analyze the effectiveness of UnsatGuided: 1) analytically with respect to alleviating two different kinds of overfitting issues; 2) empirically based on an experiment over the 224 bugs of the Defects4J repository. The main result is that automatic test generation is effective in alleviating one kind of overfitting, issue-regression introduction, but due to oracle problem, has minimal positive impact on alleviating the other kind of overfitting issue-incomplete fixing. }, year = {2019} } @inproceedings{Durieux1430140, author = {Durieux, T. and Abreu, R. and Monperrus, Martin and Bissyande, T. F. and Cruz, L.}, booktitle = {Proceedings - 2019 IEEE International Conference on Software Maintenance and Evolution, ICSME 2019 : }, institution = {KTH, Theoretical Computer Science, TCS}, note = {QC 20200513. Part of ISBN 9781728130941}, pages = {291--295}, publisher = {Institute of Electrical and Electronics Engineers Inc.}, title = {An Analysis of 35+ Million Jobs of Travis CI}, DOI = {10.1109/ICSME.2019.00044}, keywords = {continuous integration, continuous integration usage, TravisCI, Computer programming, Computer software maintenance, Continuous integrations, Corporate users, MicroSoft, Open source developers, Run test, Open source software}, abstract = {Travis CI handles automatically thousands of builds every day to, amongst other things, provide valuable feedback to thousands of open-source developers. In this paper, we investigate Travis CI to firstly understand who is using it, and when they start to use it. Secondly, we investigate how the developers use Travis CI and finally, how frequently the developers change the Travis CI configurations. 
We observed during our analysis that the main users of Travis CI are corporate users such as Microsoft. And the programming languages used in Travis CI by those users do not follow the same popularity trend than on GitHub, for example, Python is the most popular language on Travis CI, but it is only the third one on GitHub. We also observe that Travis CI is set up on average seven days after the creation of the repository and the jobs are still mainly used (60%) to run tests. And finally, we observe that 7.34% of the commits modify the Travis CI configuration. We share the biggest benchmark of Travis CI jobs (to our knowledge): It contains 35,793,144 jobs from 272,917 different GitHub projects. }, year = {2019} } @article{Danglot1452687, author = {Danglot, Benjamin and Monperrus, Martin and Rudametkin, Walter and Baudry, Benoit}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {KTH, Software and Computer systems, SCS}, institution = {INRIA, Lille-Nord Europe, 40 Avenue Halley, Villeneuve d’Ascq, 59650, France}, institution = {Université de Lille, 42 rue Paul Duez, 59000, Lille, France}, journal = {Empirical Software Engineering}, note = {QC 20200707}, number = {4}, pages = {2379--2415}, publisher = {Springer Nature}, title = {An approach and benchmark to detect behavioral changes of commits in continuous integration}, volume = {25}, DOI = {10.1007/s10664-019-09794-7}, keywords = {Behavioral change detection, Continuous Integration, Test amplification, Integration, Open source software, Testing, Behavioral changes, Continuous integrations, Development process, Fully automated, Generating variations, Good practices, Search-based, Test amplifications, Software testing}, abstract = {When a developer pushes a change to an application’s codebase, a good practice is to have a test case specifying this behavioral change. 
Thanks to continuous integration (CI), the test is run on subsequent commits to check that they do not introduce a regression for that behavior. In this paper, we propose an approach that detects behavioral changes in commits. As input, it takes a program, its test suite, and a commit. Its output is a set of test methods that capture the behavioral difference between the pre-commit and post-commit versions of the program. We call our approach DCI (Detecting behavioral changes in CI). It works by generating variations of the existing test cases through (i) assertion amplification and (ii) a search-based exploration of the input space. We evaluate our approach on a curated set of 60 commits from 6 open source Java projects. To our knowledge, this is the first ever curated dataset of real-world behavioral changes. Our evaluation shows that DCI is able to generate test methods that detect behavioral changes. Our approach is fully automated and can be integrated into current development processes. The main limitations are that it targets unit tests and works on a relatively small fraction of commits. More specifically, DCI works on commits that have a unit test that already executes the modified code. In practice, from our benchmark projects, we found 15.29% of commits to meet the conditions required by DCI. }, URL = {https://link.springer.com/article/10.1007/s10664-019-09794-7}, year = {2020} } @article{Martinez1300612, author = {Martinez, M. and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, journal = {Journal of Systems and Software}, note = {QC 20190329}, pages = {65--80}, title = {Astor : Exploring the design space of generate-and-validate program repair beyond GenProg}, volume = {151}, DOI = {10.1016/j.jss.2019.01.069}, keywords = {Automated Program Repair, Defects, Evaluation Frameworks, Software Bugs, Software Maintenance, Software Testing}, abstract = {This article contributes to defining the design space of program repair. 
Repair approaches can be loosely characterized according to the main design philosophy, in particular “generate- and-validate” and synthesis-based approaches. Each of those repair approaches is a point in the design space of program repair. Our goal is to facilitate the design, development and evaluation of repair approaches by providing a framework that: a) contains components commonly present in most approaches, b) provides built-in implementations of existing repair approaches. This paper presents a Java framework named Astor that focuses on the design space of generate-and-validate repair approaches. The key novelty of Astor is to provides explicit extension points to explore the design space of program repair. Thanks to those extension points, researchers can both reuse existing program repair components and implement new ones. Astor includes 6 unique implementations of repair approaches in Java, including GenProg for Java called jGenProg. Researchers have already defined new approaches over Astor. The implementations of program repair approaches built already available in Astor are capable of repairing, in total, 98 real bugs from 5 large Java programs. Astor code is publicly available on Github: https://github.com/SpoonLabs/astor. 
}, year = {2019} } @article{Etemadi1878189, author = {Etemadi, Khashayar and Sharma, Aman and Madeiral, Fernanda and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Vrije Universiteit Amsterdam, Amsterdam, HV, The Netherlands, 1081, HV}, journal = {IEEE Transactions on Software Engineering}, note = {QC 20240701}, number = {11}, pages = {4988--5007}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {Augmenting Diffs With Runtime Information}, volume = {49}, DOI = {10.1109/TSE.2023.3324258}, keywords = {Code diff, code review, dynamic program analysis, runtime differencing}, abstract = {Source code diffs are used on a daily basis as part of code review, inspection, and auditing. To facilitate understanding, they are typically accompanied by explanations that describe the essence of what is changed in the program. As manually crafting high-quality explanations is a cumbersome task, researchers have proposed automatic techniques to generate code diff explanations. Existing explanation generation methods solely focus on static analysis, i.e., they do not take advantage of runtime information to explain code changes. In this article, we propose Collector-Sahab, a novel tool that augments code diffs with runtime difference information. Collector-Sahab compares the program states of the original (old) and patched (new) versions of a program to find unique variable values. Then, Collector-Sahab adds this novel runtime information to the source code diff as shown, for instance, in code reviewing systems. As an evaluation, we run Collector-Sahab on 584 code diffs for Defects4J bugs and find it successfully augments the code diff for 95% (555/584) of them. We also perform a user study and ask eight participants to score the augmented code diffs generated by Collector-Sahab. Per this user study, we conclude that developers find the idea of adding runtime data to code diffs promising and useful. 
Overall, our experiments show the effectiveness and usefulness of Collector-Sahab in augmenting code diffs with runtime difference information. Publicly-available repository: https://github.com/ASSERT-KTH/collector-sahab. }, year = {2023} } @article{Ye1638221, author = {Ye, He and Gu, Jian and Martinez, M. and Durieux, Thomas and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {KTH, Software and Computer systems, SCS}, journal = {IEEE Transactions on Software Engineering}, note = {QC 20220216}, number = {8}, pages = {2920--2938}, publisher = {Institute of Electrical and Electronics Engineers Inc.}, title = {Automated Classification of Overfitting Patches with Statically Extracted Code Features}, volume = {48}, DOI = {10.1109/TSE.2021.3071750}, keywords = {Automatic program repair, Code features, Feature extraction, Maintenance engineering, Overfitting patch, Patch assessment, Predictive models, Software, Syntactics, Tools, Training}, abstract = {Automatic program repair (APR) aims to reduce the cost of manually fixing software defects. However, APR suffers from generating a multitude of overfitting patches, those patches that fail to correctly repair the defect beyond making the tests pass. This paper presents a novel overfitting patch detection system called ODS to assess the correctness of APR patches. ODS first statically compares a patched program and a buggy program in order to extract code features at the abstract syntax tree (AST) level. Then, ODS uses supervised learning with the captured code features and patch correctness labels to automatically learn a probabilistic model. The learned ODS model can then finally be applied to classify new and unseen program repair patches. We conduct a large-scale experiment to evaluate the effectiveness of ODS on patch correctness classification based on 10,302 patches from Defects4J, Bugs.jar and Bears benchmarks. 
The empirical evaluation shows that ODS is able to correctly classify 71.9% of program repair patches from 26 projects, which improves the state-of-the-art. ODS is applicable in practice and can be employed as a post-processing procedure to classify the patches generated by different APR systems.  }, year = {2022} } @article{Ye1542036, author = {Ye, He and Martinez, Matias and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Univ Valenciennes, Valenciennes, France.}, journal = {Empirical Software Engineering}, note = {QC 20210406}, number = {2}, eid = {20}, publisher = {Springer Nature}, title = {Automated patch assessment for program repair at scale}, volume = {26}, DOI = {10.1007/s10664-020-09920-w}, keywords = {Automatic program repair, Automatic patch assessment}, abstract = {In this paper, we do automatic correctness assessment for patches generated by program repair systems. We consider the human-written patch as ground truth oracle and randomly generate tests based on it, a technique proposed by Shamshiri et al., called Random testing with Ground Truth (RGT) in this paper. We build a curated dataset of 638 patches for Defects4J generated by 14 state-of-the-art repair systems, we evaluate automated patch assessment on this dataset. The results of this study are novel and significant: First, we improve the state of the art performance of automatic patch assessment with RGT by 190% by improving the oracle; Second, we show that RGT is reliable enough to help scientists to do overfitting analysis when they evaluate program repair systems; Third, we improve the external validity of the program repair knowledge with the largest study ever. 
}, year = {2021} } @unpublished{Zhang1437439, author = {Zhang, Long and Tiwari, Deepika and Morin, Brice and Baudry, Benoit and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, note = {QC 20210113}, title = {Automatic Observability for Dockerized Java Applications}, keywords = {observability, fault injection, dynamic analysis, software resilience, Docker}, abstract = {Docker is a virtualization technique heavily used in industry to build cloud-based systems. In this context, observability means that it is hard for engineers to get timely and accurate information about the running state in production, due to scale and virtualization. In this paper, we present a novel approach, called POBS, to automatically improve observability of Dockerized Java applications. POBS is based on automated transformations of Docker configuration files. Our approach injects additional modules in the production application, for providing better observability and for supporting fault injection. We evaluate POBS with open-source Java applications. Our key result is that 564/880 (64%) of Docker configuration files can be automatically augmented with better observability. This calls for more research on automated transformation techniques in the Docker ecosystem. 
}, URL = {https://arxiv.org/abs/1912.06914}, } @article{Danglot1365812, author = {Danglot, Benjamin and Vera-Perez, Oscar Luis and Baudry, Benoit and Monperrus, Martin}, institution = {KTH, Software and Computer systems, SCS}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Inria Lille Nord Europe, Parc Sci Haute Borne 40,Ave Halley,Bat A,Pk Plaza, F-59650 Villeneuve Dascq, France.}, institution = {Inria Rennes Bretagne Atlantique, Campus Beaulieu,263 Ave Gen Leclerc, F-35042 Rennes, France.}, journal = {Empirical Software Engineering}, note = {QC 20191025}, number = {4}, pages = {2603--2635}, title = {Automatic test improvement with DSpot : a study with ten mature open-source projects}, volume = {24}, DOI = {10.1007/s10664-019-09692-y}, keywords = {Test improvement, Junit test, Pull request empirical study}, abstract = {In the literature, there is a rather clear segregation between manually written tests by developers and automatically generated ones. In this paper, we explore a third solution: to automatically improve existing test cases written by developers. We present the concept, design and implementation of a system called DSpot, that takes developer-written test cases as input (JUnit tests in Java) and synthesizes improved versions of them as output. Those test improvements are given back to developers as patches or pull requests, that can be directly integrated in the main branch of the test code base. We have evaluated DSpot in a deep, systematic manner over 40 real-world unit test classes from 10 notable and open-source software projects. We have amplified all test methods from those 40 unit test classes. In 26/40 cases, DSpot is able to automatically improve the test under study, by triggering new behaviors and adding new valuable assertions. Next, for ten projects under consideration, we have proposed a test improvement automatically synthesized by DSpot to the lead developers. 
In total, 13/19 proposed test improvements were accepted by the developers and merged into the main code base. This shows that DSpot is capable of automatically improving unit-tests in real-world, large scale Java software. }, year = {2019} } @inproceedings{Madeiral1322959, author = {Madeiral, F. and Urli, S. and Maia, M. and Monperrus, Martin}, booktitle = {SANER 2019 - Proceedings of the 2019 IEEE 26th International Conference on Software Analysis, Evolution, and Reengineering : }, institution = {KTH, Theoretical Computer Science, TCS}, note = {QC 20190611Part of ISBN 9781728105918}, pages = {468--478}, eid = {8667991}, title = {BEARS : An Extensible Java Bug Benchmark for Automatic Program Repair Studies}, DOI = {10.1109/SANER.2019.8667991}, keywords = {Java programming language, Open source software, Pipelines, Reengineering, Repair, Software testing, Automatic creations, Automatic programs, Bug tracking system, Continuous integrations, Open source projects, Repair tools, Research communities, Test failure, Program debugging}, abstract = {Benchmarks of bugs are essential to empirically evaluate automatic program repair tools. In this paper, we present BEARS, a project for collecting and storing bugs into an extensible bug benchmark for automatic repair studies in Java. The collection of bugs relies on commit building state from Continuous Integration (CI) to find potential pairs of buggy and patched program versions from open-source projects hosted on GitHub. Each pair of program versions passes through a pipeline where an attempt of reproducing a bug and its patch is performed. The core step of the reproduction pipeline is the execution of the test suite of the program on both program versions. If a test failure is found in the buggy program version candidate and no test failure is found in its patched program version candidate, a bug and its patch were successfully reproduced. 
The uniqueness of Bears is the usage of CI (builds) to identify buggy and patched program version candidates, which has been widely adopted in the last years in open-source projects. This approach allows us to collect bugs from a diversity of projects beyond mature projects that use bug tracking systems. Moreover, BEARS was designed to be publicly available and to be easily extensible by the research community through automatic creation of branches with bugs in a given GitHub repository, which can be used for pull requests in the BEARS repository. We present in this paper the approach employed by BEARS, and we deliver the version 1.0 of BEARS, which contains 251 reproducible bugs collected from 72 projects that use the Travis CI and Maven build environment. }, year = {2019} } @inproceedings{ReyesGarcía1932571, author = {Reyes García, Frank and Baudry, Benoit and Monperrus, Martin}, booktitle = {Proceedings - 2024 IEEE International Conference on Source Code Analysis and Manipulation, SCAM 2024 : }, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Université de Montréal, Montréal, Canada}, note = {Part of ISBN 9798331528508QC 20260414}, pages = {36--46}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {Breaking-Good : Explaining Breaking Dependency Updates with Build Analysis}, DOI = {10.1109/SCAM63643.2024.00014}, keywords = {Breaking dependency updates, Explanations, Java, Maven, Software Dependency}, abstract = {Dependency updates often cause compilation errors when new dependency versions introduce changes that are incompatible with existing client code. Fixing breaking dependency updates is notoriously hard, as their root cause can be hidden deep in the dependency tree. We present Breaking-Good, a tool that automatically generates explanations for breaking updates. 
Breaking-Good provides a detailed categorization of compilation errors, identifying several factors related to changes in direct and indirect dependencies, incompatibilities between Java versions, and client-specific configuration. With a blended analysis of log and dependency trees, Breaking-Good generates detailed explanations for each breaking update. These explanations help developers understand the causes of the breaking update, and suggest possible actions to fix the breakage. We evaluate Breaking-Good on 243 real-world breaking dependency updates. Our results indicate that Breaking-Good accurately identifies root causes and generates automatic explanations for 70 % of these breaking updates. Our user study demonstrates that the generated explanations help developers. Breaking-Good is the first technique that automatically identifies the causes of a breaking dependency update and explains the breakage accordingly. }, year = {2024} } @inproceedings{ReyesGarcía1888722, author = {Reyes García, Frank and Gamage, Yogya and Skoglund, Gabriel and Baudry, Benoit and Monperrus, Martin}, booktitle = {Proceedings - 2024 IEEE International Conference on Software Analysis, Evolution and Reengineering, SANER 2024 : }, institution = {KTH, Theoretical Computer Science, TCS}, institution = {KTH, Software and Computer systems, SCS}, note = { Part of ISBN 9798350330663QC 20240823}, pages = {159--170}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {BUMP : A Benchmark of Reproducible Breaking Dependency Updates}, DOI = {10.1109/SANER60148.2024.00024}, keywords = {Benchmark, Breaking dependency updates, Dependency engineering, Java, Maven, Reproducibility}, abstract = {Third-party dependency updates can cause a build to fail if the new dependency version introduces a change that is incompatible with the usage: this is called a breaking dependency update. 
Research on breaking dependency updates is active, with works on characterization, understanding, automatic repair of breaking updates, and other software engineering aspects. All such research projects require a benchmark of breaking updates that has the following properties: 1) it contains real-world breaking updates; 2) the breaking updates can be executed; 3) the benchmark provides stable scientific artifacts of breaking updates over time, a property we call 'reproducibility'. To the best of our knowledge, such a benchmark is missing. To address this problem, we present BUMP, a new benchmark that contains reproducible breaking dependency updates in the context of Java projects built with the Maven build system. BUMP contains 571 breaking dependency updates collected from 153 Java projects. BUMP ensures long-term reproducibility of dependency updates on different platforms, guaranteeing consistent build failures. We categorize the different causes of build breakage in BUMP, providing novel insights for future work on breaking update engineering. To our knowledge, BUMP is the first of its kind, providing hundreds of real-world breaking updates that have all been made reproducible. }, year = {2024} } @article{Sharma2044960, author = {Sharma, Aman and Baudry, Benoit and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science}, institution = {Université de Montréal, Montréal, Canada, H3T 1J4}, journal = {IEEE Transactions on Software Engineering}, note = {QC 20260311}, number = {1}, pages = {54--69}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {Causes and Canonicalization of Unreproducible Builds in Java}, volume = {52}, DOI = {10.1109/TSE.2025.3627891}, keywords = {Java, Reproducible builds, canonicalization, software supply chain}, abstract = {The increasing complexity of software supply chains and the rise of supply chain attacks have elevated concerns around software integrity. 
Users and stakeholders face significant challenges in validating that a given software artifact corresponds to its declared source. Reproducible Builds address this challenge by ensuring that independently performed builds from identical source code produce identical binaries. However, achieving reproducibility at scale remains difficult, especially in Java, due to a range of non-deterministic factors and caveats in the build process. In this work, we focus on reproducibility in Java-based software, archetypal of enterprise applications. We introduce a conceptual framework for reproducible builds, we analyze a large dataset from Reproducible Central, and we develop a novel taxonomy of six root causes of unreproducibility. We study actionable mitigations: artifact and bytecode canonicalization using OSS-Rebuild and jNorm respectively. Finally, we present Chains-Rebuild (improvements to OSS-Rebuild), a tool that raises reproducibility success from 9.48% to 26.60% on 12,803 unreproducible artifacts. To sum up, our contributions are the first large-scale taxonomy of build unreproducibility causes in Java, a publicly available dataset of unreproducible builds, and Chains-Rebuild, a canonicalization tool for mitigating unreproducible builds in Java. 
}, year = {2026} } @article{Balliu1842523, author = {Balliu, Musard and Baudry, Benoit and Bobadilla, Sofia and Ekstedt, Mathias and Monperrus, Martin and Ron Arteaga, Javier and Sharma, Aman and Skoglund, Gabriel and Soto Valero, C{\’e;}sar and Wittlinger, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {KTH, Software and Computer systems, SCS}, institution = {KTH, Network and Systems Engineering}, journal = {IEEE Security and Privacy}, note = {QC 20240314}, number = {6}, pages = {12--23}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {Challenges of Producing Software Bill of Materials for Java}, volume = {21}, DOI = {10.1109/MSEC.2023.3302956}, keywords = {Java, Software, Production, Supply chain management, Standards, Bills of materials, Software reliability}, abstract = {Software bills of materials (SBOMs) promise to become the backbone of software supply chain hardening. We deep-dive into six tools and the SBOMs they produce for complex open source Java projects, revealing challenges regarding the accurate production and usage of SBOMs. }, year = {2023} } @unpublished{Zhang1639560, author = {Zhang, Long and Ron Arteaga, Javier and Baudry, Benoit and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {KTH, Software and Computer systems, SCS}, note = {QC 20220222}, title = {Chaos Engineering of Ethereum Blockchain Clients}, DOI = {10.48550/arXiv.2111.00221}, keywords = {chaos engineering, Ethereum, fault injection, resilience benchmarking}, abstract = {The Ethereum blockchain is the operational backbone of major decentralized finance platforms. As such, it is expected to be exceptionally reliable. In this paper, we present ChaosETH, a chaos engineering tool for resilience assessment of Ethereum clients. ChaosETH operates in the following manner: First, it monitors Ethereum clients to determine their normal behavior. 
Then, it injects system call invocation errors into the Ethereum clients and observes the resulting behavior under perturbation. Finally, ChaosETH compares the behavior recorded before, during, and after perturbation to assess the impact of the injected system call invocation errors. The experiments are performed on the two most popular Ethereum client implementations: GoEthereum and OpenEthereum. We experiment with 22 different types of system call invocation errors. We assess their impact on the Ethereum clients with respect to 15 application-level metrics. Our results reveal a broad spectrum of resilience characteristics of Ethereum clients in the presence of system call invocation errors, ranging from direct crashes to full resilience. The experiments clearly demonstrate the feasibility of applying chaos engineering principles to blockchains. }, URL = {https://doi.org/10.48550/arXiv.2111.00221}, } @article{Yu1564402, author = {Yu, Zhongxing and Bai, Chenggang and Seinturier, Lionel and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Beihang Univ, Dept Automat Control, Beijing Univ Aeronaut & Astronaut, Beijing 100191, Peoples R China.;Beihang Univ, Dept Automat Control, Beijing 100191, Peoples R China.}, institution = {Inria Lille Nord Europe, F-59650 Villeneuve Dascq, France.;Univ Lille, Comp Sci, F-59000 Lille, France.;Univ Lille, Comp Sci Dept, F-59000 Lille, France.}, journal = {IEEE Transactions on Software Engineering}, note = {QC 20210611}, number = {5}, pages = {969--986}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {Characterizing the Usage, Evolution and Impact of Java Annotations in Practice}, volume = {47}, DOI = {10.1109/TSE.2019.2910516}, keywords = {Annotations, Java, Tools, Libraries, Runtime, Open source software, Annotation, software evolution, empirical study, statistical modelling}, abstract = {Annotations have been formally introduced into Java since Java 5. 
Since then, annotations have been widely used by the Java community for different purposes, such as compiler guidance and runtime processing. Despite the ever-growing use, there is still limited empirical knowledge about the actual usage of annotations in practice, the changes made to annotations during software evolution, and the potential impact of annotations on code quality. To fill this gap, we perform the first large-scale empirical study about Java annotations on 1,094 notable open-source projects hosted on GitHub. Our study systematically investigates annotation usage, annotation evolution, and annotation impact, and generates 10 novel and important findings. We also present the implications of our findings, which shed light for developers, researchers, tool builders, and language or library designers in order to improve all facets of Java annotation engineering. }, year = {2021} } @unpublished{Hidvegi1907148, author = {Hidv{\’e;}gi, D{\’a;}vid and Etemadi, Khashayar and Bobadilla, Sofia and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, note = {QC 20241023}, title = {CigaR: Cost-efficient Program Repair with LLMs}, } @inproceedings{Martinez1421426, author = {Martinez, Matias and Monperrus, Martin}, booktitle = {2019 IEEE/ACM 41st International Conference on Software Engineering : Companion Proceedings (ICSE-Companion 2019)}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Univ Polytech Hauts De France, Valenciennes, France.}, note = {QC 20200402Part of ISBN 978-1-7281-1764-5; 978-1-7281-1765-2 }, pages = {79--82}, publisher = {IEEE}, title = {Coming : Tool for Mining Change Pattern Instances from Git Commits}, DOI = {10.1109/ICSE-Companion.2019.00043}, abstract = {Software repositories such as Git have become a relevant source of information for software engineer researchers. 
For instance, the detection of commits that fulfill a given criterion (e.g., bugfixing commits) is one of the most frequent tasks done to understand the software evolution. However, to our knowledge, there is no open-source tool that, given a Git repository, returns all the instances of a given code change pattern. In this paper we present Coming, a tool that takes as input a Git repository and mines instances of code change patterns present on each commit. For that, Coming computes fine-grained code changes between two consecutive revisions, analyzes those changes to determine if they correspond to an instance of a change pattern (specified by the user using XML), and finally, after analyzing all the commits, it presents a) the frequency of code changes and b) the instances found in each commit. We evaluate Coming on a set of 28 pairs of revisions from Defects4J, finding instances of change patterns that involve If conditions on 26 of them. }, URL = {https://2019.icse-conferences.org/}, URL = {https://ieeexplore.ieee.org/xpl/conhome/8790387/proceeding}, year = {2019} } @inproceedings{Danglot1280393, author = {Danglot, Benjamin and Preux, Philippe and Baudry, Benoit and Monperrus, Martin}, booktitle = {PROCEEDINGS 2018 IEEE/ACM 40TH INTERNATIONAL CONFERENCE ON SOFTWARE ENGINEERING (ICSE) : }, institution = {KTH, Theoretical Computer Science, TCS}, note = {QC 20190118}, pages = {481--481}, publisher = {IEEE}, title = {Correctness Attraction : A Study of Stability of Software Behavior Under Runtime Perturbation}, DOI = {10.1145/3180155.3182548}, year = {2018} } @article{Danglot1269676, author = {Danglot, Benjamin and Preux, Philippe and Baudry, Benoit and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {KTH, Centre for Advanced Software Technology Research (CASTOR)}, journal = {Empirical Software Engineering}, note = {QC 20181211}, number = {4}, pages = {2086--2119}, title = {Correctness attraction : a study of stability of 
software behavior under runtime perturbation}, volume = {23}, DOI = {10.1007/s10664-017-9571-8}, keywords = {Perturbation analysis; Software correctness; Empirical study}, abstract = {Can the execution of software be perturbed without breaking the correctness of the output? In this paper, we devise a protocol to answer this question from a novel perspective. In an experimental study, we observe that many perturbations do not break the correctness in ten subject programs. We call this phenomenon “correctness attraction”. The uniqueness of this protocol is that it considers a systematic exploration of the perturbation space as well as perfect oracles to determine the correctness of the output. To this extent, our findings on the stability of software under execution perturbations have a level of validity that has never been reported before in the scarce related work. A qualitative manual analysis enables us to set up the first taxonomy ever of the reasons behind correctness attraction. }, year = {2018} } @inproceedings{CabreraArteaga1543343, author = {Cabrera Arteaga, Javier and Floros, Orestis and Vera Perez, Oscar and Baudry, Benoit and Monperrus, Martin}, booktitle = { : }, institution = {KTH, Software and Computer systems, SCS}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Univ Rennes, Inria, CNRS, IRISA}, note = {Part of proceedings: ISBN 1-891562-66-5, QC 20230117}, title = {CROW: Code Diversification for WebAssembly}, DOI = {10.14722/madweb.2021.23004}, keywords = {WebAssembly, Web, Diversification}, abstract = {The adoption of WebAssembly increases rapidly, as it provides a fast and safe model for program execution in the browser. However, WebAssembly is not exempt from vulnerabilities that can be exploited by malicious observers. Code diversification can mitigate some of these attacks. In this paper, we present the first fully automated workflow for the diversification of WebAssembly binaries. 
We present CROW, an open-source tool implementing this workflow through enumerative synthesis of diverse code snippets expressed in the LLVMintermediate representation. We evaluate CROW’s capabilitieson303C programs and study its use on a real-life security-sensitive program: libsodium, a modern cryptographic library. Overall, CROW is able to generate diverse variants for239out of303 (79%)small programs. Furthermore, our experiments show that our approach and tool is able to successfully diversify off-the-shelf cryptographic software (libsodium). }, URL = {https://dx.doi.org/10.14722/madweb.2021.23004}, year = {2021} } @inproceedings{Vera-Perez1673477, author = {Vera-P{\’e;}rez, O. L. and Monperrus, Martin and Baudry, Benoit}, booktitle = {ASE 2018 - Proceedings of the 33rd ACM/IEEE International Conference on Automated Software Engineering : }, institution = {KTH, Theoretical Computer Science, TCS}, institution = {KTH, Software and Computer systems, SCS}, note = {Part of proceedings: ISBN 978-1-4503-5937-5QC 20220621}, pages = {908--911}, publisher = {Association for Computing Machinery (ACM)}, title = {Descartes : A pitest engine to detect pseudo-tested methods: Tool demonstration}, DOI = {10.1145/3238147.3240474}, keywords = {Extreme mutation, Mutation testing, PITest, Pseudo-tested methods, Software testing, Engines, Open source software, Mutation operators, Mutation score, Open source projects, Tool demonstration}, abstract = {Descartes is a tool that implements extreme mutation operators and aims at finding pseudo-tested methods in Java projects. It leverages the efficient transformation and runtime features of PITest. The demonstration compares Descartes with Gregor, the default mutation engine provided by PITest, in a set of real open source projects. It considers the execution time, number of mutants created and the relationship between the mutation scores produced by both engines. It provides some insights on the main features exposed by Descartes. 
}, URL = {http://www.ase2018.com/}, year = {2018} } @inproceedings{Liu2000817, author = {Liu, Raphina and Bobadilla, Sofia and Baudry, Benoit and Monperrus, Martin}, booktitle = {FSE Companion 2025 - Companion Proceedings of the 33rd ACM International Conference on the Foundations of Software Engineering : }, institution = {KTH}, institution = {KTH, Theoretical Computer Science, TCS}, note = {Part of ISBN 9798400712760QC 20250925}, pages = {1045--1049}, publisher = {Association for Computing Machinery (ACM)}, title = {Dirty-Waters: Detecting Software Supply Chain Smells}, DOI = {10.1145/3696630.3728578}, keywords = {Open Source, Software Security, Software Supply Chain}, abstract = {Using open-source dependencies is essential in modern software development. However, this practice implies significant trust in third-party code, while there is little support for developers to assess this trust. As a consequence, attacks, called software supply chain attacks, have been increasingly occurring through third-party dependencies. In this paper, we target the problem of projects that use dependencies, where developers are unaware of the potential risks posed by their software supply chain. We define the novel concept of software supply chain smell and present Dirty-Waters, a novel tool for detecting software supply chain smells. We evaluate Dirty-Waters on three JavaScript projects and demonstrate the prevalence of all proposed software supply chain smells. Dirty-Waters reveals potential risks for previously invisible problems and provides clear indicators for developers to act on the security of their supply chain. A video demonstrating Dirty-Waters is available at: http://l.4open.science/dirty-waters-demo. 
}, year = {2025} } @inproceedings{Sobreira1243077, author = {Sobreira, Victor and Durieux, Thomas and Madeiral, Fernanda and Monperrus, Martin and De Almeida Maia, Marcelo}, booktitle = {25th IEEE International Conference on Software Analysis, Evolution and Reengineering, SANER 2018 - Proceedings : }, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Univ Fed Uberlandia, Uberlandia, MG, Brazil.}, institution = {INRIA, Rocquencourt, France.;Univ Lille, Lille, France.}, institution = {Univ Fed Uberlandia, Uberlandia, MG, Brazil.}, institution = {Univ Fed Uberlandia, Uberlandia, MG, Brazil.}, note = {Part of proceedingsg: ISBN 978-1-5386-4969-5QC 20180830}, pages = {130--140}, title = {Dissection of a bug dataset : Anatomy of 395 patches from Defects4J}, DOI = {10.1109/SANER.2018.8330203}, abstract = {Well-designed and publicly available datasets of bugs are an invaluable asset to advance research fields such as fault localization and program repair as they allow directly and fairly comparison between competing techniques and also the replication of experiments. These datasets need to be deeply understood by researchers: The answer for questions like 'which bugs can my technique handle?' and 'for which bugs is my technique effective?' depends on the comprehension of properties related to bugs and their patches. However, such properties are usually not included in the datasets, and there is still no widely adopted methodology for characterizing bugs and patches. In this work, we deeply study 395 patches of the Defects4J dataset. Quantitative properties (patch size and spreading) were automatically extracted, whereas qualitative ones (repair actions and patterns) were manually extracted using a thematic analysis-based approach. 
We found that 1) the median size of Defects4J patches is four lines, and almost 30% of the patches contain only addition of lines; 2) 92% of the patches change only one file, and 38% has no spreading at all; 3) the top-3 most applied repair actions are addition of method calls, conditionals, and assignments, occurring in 77% of the patches; and 4) nine repair patterns were found for 95% of the patches, where the most prevalent, appearing in 43% of the patches, is on conditional blocks. These results are useful for researchers to perform advanced analysis on their techniques' results based on Defects4J. Moreover, our set of properties can be used to characterize and compare different bug datasets. }, year = {2018} } @article{Bobadilla2047171, author = {Bobadilla, Sofia and Jin, Monica and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science}, journal = {IEEE Transactions on Software Engineering}, note = {QC 20260319}, number = {1}, pages = {100--115}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {Do Automated Fixes Truly Mitigate Smart Contract Exploits?}, volume = {52}, DOI = {10.1109/TSE.2025.3618123}, keywords = {Smart contracts, Maintenance engineering, Codes, Source coding, Blockchains, Prevention and mitigation, Manuals, Static analysis, Systematic literature review, Formal verification}, abstract = {Automated Program Repair (APR) for smart contract security promises to automatically mitigate smart contract vulnerabilities responsible for billions in financial losses. However, the true effectiveness of this research in addressing smart contract exploits remains uncharted territory. This paper bridges this critical gap by introducing a novel and systematic experimental framework for evaluating exploit mitigation of program repair tools for smart contracts. 
We qualitatively and quantitatively analyze 20 state-of-the-art APR tools using a dataset of 143 vulnerable smart contracts, for which we manually craft 91 executable exploits. We are the very first to define and measure the essential "exploit mitigation rate", giving researchers and practitioners a real sense of effectiveness. Our findings reveal substantial disparities in the state of the art, with an exploit mitigation rate ranging from a low of 29% to a high of 74%. Our study identifies systemic limitations, such as inconsistent functionality preservation, that must be addressed in future research on program repair for smart contracts. }, year = {2026} } @misc{Baudry1639160, author = {Baudry, Benoit and Monperrus, Martin}, institution = {KTH, Software and Computer systems, SCS}, institution = {KTH, Theoretical Computer Science, TCS}, note = {QC 20220314}, title = {Dynamic Analysis in the Browser}, URL = {https://cacm.acm.org/blogs/blog-cacm/239266-dynamic-analysis-in-the-browser/fulltext}, year = {2019} } @article{Monperrus1240389, author = {Monperrus, Martin and Weimer, Westley}, institution = {KTH}, institution = {Univ Michigan, Ann Arbor, MI 48109 USA.}, journal = {Empirical Software Engineering}, note = {QC 20180821}, number = {5}, pages = {2865--2865}, title = {Editor's Note : Special Issue on Automatic Software Repair}, volume = {23}, DOI = {10.1007/s10664-018-9632-7}, year = {2018} } @article{Etemadi1658845, author = {Etemadi, Khashayar and Tarighat, Niloofar and Yadav, Siddharth and Martinez, Matias and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Sharif Univ Technol, Tehran, Iran.}, institution = {Indraprastha Inst Informat Technol Delhi, Delhi, India.}, institution = {Univ Polytech Hauts De France, Valenciennes, France.}, journal = {Journal of Systems and Software}, note = {QC 20220817}, eid = {111263}, publisher = {Elsevier BV}, title = {Estimating the potential of program repair search spaces with commit 
analysis}, volume = {188}, DOI = {10.1016/j.jss.2022.111263}, keywords = {Program repair, Search-space, Static code analysis, Commit analysis}, abstract = {The most natural method for evaluating program repair systems is to run them on bug datasets, such as Defects4J. Yet, using this evaluation technique on arbitrary real-world programs requires heavy configuration. In this paper, we propose a purely static method to evaluate the potential of the search space of repair approaches. This new method enables researchers and practitioners to encode the search spaces of repair approaches and select potentially useful ones without struggling with tool configuration and execution. We encode the search spaces by specifying the repair strategies they employ. Next, we use the specifications to check whether past commits lie in repair search spaces. For a repair approach, including many human-written past commits in its search space indicates its potential to generate useful patches. We implement our evaluation method in LIGHTER. LIGHTER gets a Git repository and outputs a list of commits whose source code changes lie in repair search spaces. We run LIGHTER on 55,309 commits from the history of 72 Github repositories with and show that LIGHTER's precision and recall are 77% and 92%, respectively. Overall, our experiments show that our novel method is both lightweight and effective to study the search space of program repair approaches. 
}, year = {2022} } @inproceedings{Durieux1245426, author = {Durieux, Thomas and Hamadi, Youssef and Yu, Zhongxing and Baudry, Benoit and Monperrus, Martin}, booktitle = {2018 IEEE 11TH INTERNATIONAL CONFERENCE ON SOFTWARE TESTING, VERIFICATION AND VALIDATION (ICST) : }, institution = {KTH, Software and Computer systems, SCS}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Univ Lille, Lille, France.;INRIA, Le Chesnay, France.}, institution = {Ecole Polytech, Palaiseau, France.}, institution = {Univ Lille, Lille, France.;INRIA, Le Chesnay, France.}, note = {QC 20180905}, pages = {139--149}, title = {Exhaustive Exploration of the Failure-oblivious Computing Search Space}, series = {IEEE International Conference on Software Testing Verification and Validation}, DOI = {10.1109/ICST.2018.00023}, abstract = {High-availability of software systems requires automated handling of crashes in presence of errors. Failure-oblivious computing is one technique that aims to achieve high availability. We note that failure-obliviousness has not been studied in depth yet, and there is very few study that helps understand why failure-oblivious techniques work. In order to make failure-oblivious computing to have an impact in practice, we need to deeply understand failure-oblivious behaviors in software. In this paper, we study, design and perform an experiment that analyzes the size and the diversity of the failure-oblivious behaviors. Our experiment consists of exhaustively computing the search space of 16 field failures of large-scale open-source Java software. The outcome of this experiment is a much better understanding of what really happens when failure-oblivious computing is used, and this opens new promising research directions. 
}, ISBN = {978-1-5386-5012-7}, year = {2018} } @inproceedings{Monperrus1417230, author = {Monperrus, Martin}, booktitle = {Proceedings - 2019 IEEE/ACM 1st International Workshop on Bots in Software Engineering, BotSE 2019 : }, institution = {KTH, Theoretical Computer Science, TCS}, note = {QC 20200327}, pages = {12--15}, title = {Explainable software bot contributions : Case study of automated bug fixes}, DOI = {10.1109/BotSE.2019.00010}, abstract = {In a software project, esp. in open-source, a contribution is a valuable piece of work made to the project: writing code, reporting bugs, translating, improving documentation, creating graphics, etc. We are now at the beginning of an exciting era where software bots will make contributions that are of similar nature than those by humans. Dry contributions, with no explanation, are often ignored or rejected, because the contribution is not understandable per se, because they are not put into a larger context, because they are not grounded on idioms shared by the core community of developers. We have been operating a program repair bot called Repairnator for 2 years and noticed the problem of "dry patches": a patch that does not say which bug it fixes, or that does not explain the effects of the patch on the system. We envision program repair systems that produce an "explainable bug fix": an integrated package of at least 1) a patch, 2) its explanation in natural or controlled language, and 3) a highlight of the behavioral difference with examples. In this paper, we generalize and suggest that software bot contributions must explainable, that they must be put into the context of the global software development conversation. }, URL = {https://ieeexplore.ieee.org/document/8823632}, year = {2019} } @article{Koyuncu1435645, author = {Koyuncu, Anil and Liu, Kui and Bissyande, Tegawende F. 
and Kim, Dongsun and Klein, Jacques and Monperrus, Martin and Le Traon, Yves}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Univ Luxembourg, SnT, Luxembourg, Luxembourg.}, institution = {Univ Luxembourg, SnT, Luxembourg, Luxembourg.}, institution = {Univ Luxembourg, SnT, Luxembourg, Luxembourg.}, institution = {Furiosa Ai, 145 Dosan Daero, Seoul, South Korea.}, institution = {Univ Luxembourg, SnT, Luxembourg, Luxembourg.}, institution = {Univ Luxembourg, SnT, Luxembourg, Luxembourg.}, journal = {Empirical Software Engineering}, note = {QC 20200605}, number = {3}, pages = {1980--2024}, publisher = {Springer}, title = {FixMiner : Mining relevant fix patterns for automated program repair}, volume = {25}, DOI = {10.1007/s10664-019-09780-z}, keywords = {Fix patterns, Patches, Program repair, Debugging, Empirical software engineering}, abstract = {Patching is a common activity in software development. It is generally performed on a source code base to address bugs or add new functionalities. In this context, given the recurrence of bugs across projects, the associated similar patches can be leveraged to extract generic fix actions. While the literature includes various approaches leveraging similarity among patches to guide program repair, these approaches often do not yield fix patterns that are tractable and reusable as actionable input to APR systems. In this paper, we propose a systematic and automated approach to mining relevant and actionable fix patterns based on an iterative clustering strategy applied to atomic changes within patches. The goal of FixMiner is thus to infer separate and reusable fix patterns that can be leveraged in other patch generation systems. Our technique, FixMiner, leverages Rich Edit Script which is a specialized tree structure of the edit scripts that captures the AST-level context of the code changes. 
FixMiner uses different tree representations of Rich Edit Scripts for each round of clustering to identify similar changes. These are abstract syntax trees, edit actions trees, and code context trees. We have evaluated FixMiner on thousands of software patches collected from open source projects. Preliminary results show that we are able to mine accurate patterns, efficiently exploiting change information in Rich Edit Scripts. We further integrated the mined patterns to an automated program repair prototype, PAR(FixMiner), with which we are able to correctly fix 26 bugs of the Defects4J benchmark. Beyond this quantitative performance, we show that the mined fix patterns are sufficiently relevant to produce patches with a high probability of correctness: 81% of PAR(FixMiner)'s generated plausible patches are correct. }, year = {2020} } @inproceedings{Tan1651818, author = {Tan, Shin Hwei and Mechtaev, Sergey and Zhang, Lingming and Monperrus, Martin}, booktitle = {Proceedings - 2021 IEEE/ACM International Workshop on Automated Program Repair, APR 2021 : APR 2021}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Southern University of Science and Technology, China}, institution = {University College, London, United Kingdom}, institution = {University of Illinois, Urbana-Champaign, United States}, note = {Part of proceedings: ISBN 978-1-6654-4472-9QC 20220413}, eid = {9474539}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {Foreword}, DOI = {10.1109/APR52552.2021.00005}, abstract = {Presents the introductory welcome message from the conference proceedings. May include the conference officers' congratulations to all involved with the conference event and publication of the proceedings record. 
}, year = {2021} } @article{Durieux1416641, author = {Durieux, Thomas and Hamadi, Youssef and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Univ Lisbon, INESC ID, Lisbon, Portugal.;Univ Lisbon, IST, Lisbon, Portugal.}, institution = {Uber Elevate Res, Paris, France.}, journal = {Software testing, verification & reliability}, note = {QC 20200324}, number = {2}, eid = {e1731}, publisher = {WILEY}, title = {Fully Automated HTML and JavaScript Rewriting for Constructing a Self-healing Web Proxy}, volume = {30}, DOI = {10.1002/stvr.1731}, keywords = {self-healing, bugs, JavaScript, proxy, chrome extension}, abstract = {Over the last few years, the complexity of web applications has increased to provide more dynamic web applications to users. The drawback of this complexity is the growing number of errors in the front-end applications. In this paper, we present an approach to provide self-healing for the web. We implemented this approach in two different tools: (i) BikiniProxy, an HTTP repair proxy, and (ii) BugBlock, a browser extension. They use five self-healing strategies to rewrite the buggy HTML and JavaScript code to handle errors in web pages. We evaluate BikiniProxy and BugBlock with a new benchmark of 555 reproducible JavaScript errors of which 31.76% can be automatically self-healed by BikiniProxy and 15.67% by BugBlock. 
}, year = {2020} } @inproceedings{Durieux1319732, author = {Durieux, Thomas and Hamadi, Youssef and Monperrus, Martin}, booktitle = {2018 29TH IEEE INTERNATIONAL SYMPOSIUM ON SOFTWARE RELIABILITY ENGINEERING (ISSRE) : }, institution = {KTH, Theoretical Computer Science, TCS}, institution = {INRIA, Lille, France.;Univ Lille, Lille, France.}, institution = {Ecole Polytech, Paris, France.}, note = {QC 20190603}, pages = {1--12}, publisher = {IEEE}, title = {Fully Automated HTML and Javascript Rewriting for Constructing a Self-healing Web Proxy}, series = {Proceedings International Symposium on Software Reliability Engineering}, DOI = {10.1109/ISSRE.2018.00012}, keywords = {}, abstract = {Over the last few years, the complexity of web applications has increased to provide more dynamic web applications to users. The drawback of this complexity is the growing number of errors in the front-end applications. In this paper, we present BikiniProxy, a novel technique to provide self-healing for the web. BikiniProxy is designed as an HTTP proxy that uses five self-healing strategies to rewrite the buggy HTML and Javascript code. We evaluate BikiniProxy with a new benchmark of 555 reproducible Javascript errors of which 31.76% can be automatically self-healed. 
}, ISBN = {978-1-5386-8321-7}, year = {2018} } @article{Baudry1909148, author = {Baudry, Benoit and Etemadi, Khashayar and Fang, Sen and Gamage, Yogya and Liu, Yi and Liu, Yuxin and Monperrus, Martin and Ron Arteaga, Javier and Silva, Andre and Tiwari, Deepika}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {KTH, Software and Computer systems, SCS}, institution = {Univ Montreal, Software Technol, Montreal, PQ H3T 1J4, Canada.}, institution = {North Carolina State Univ, Raleigh, NC 27606 USA.}, journal = {IEEE Software}, note = {QC 20241030}, number = {6}, pages = {55--64}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {Generative AI to Generate Test Data Generators}, volume = {41}, DOI = {10.1109/MS.2024.3418570}, keywords = {Generators, Cultural differences, Testing, Libraries, Java, Codes, Vectors}, abstract = {High quality data is essential for designing effective software test suites. We propose three original methods for using large language models to generate representative test data, which fit to the domain of the program under test and are culturally adequate. 
}, year = {2024} } @inproceedings{Saavedra1869243, author = {Saavedra, Nuno and Silva, Andr{\’e;} and Monperrus, Martin}, booktitle = {Proceedings - 2024 ACM/IEEE 46th International Conference on Software Engineering: Companion, ICSE-Companion 2024 : }, institution = {KTH, Theoretical Computer Science, TCS}, institution = {INESC-ID/IST, University of Lisbon, Lisbon, Portugal}, note = { Part of ISBN 979-840070502-1QC 20240613}, pages = {1--5}, publisher = {Association for Computing Machinery (ACM)}, title = {GitBug-Actions: Building Reproducible Bug-Fix Benchmarks with GitHub Actions}, series = {Proceedings - International Conference on Software Engineering}, DOI = {10.1145/3639478.3640023}, keywords = {Bug Benchmark, Bug Database, GitHub Actions, Program Analysis, Reproducibility, Software Bugs, Software Testing}, abstract = {Bug-fix benchmarks are fundamental in advancing various subfields of software engineering such as automatic program repair (APR) and fault localization (FL). A good benchmark must include recent examples that accurately reflect t echnologies a nd development practices of today. To be executable in the long term, a benchmark must feature test suites that do not degrade overtime due to, for example, dependencies that are no longer available. Existing benchmarks fail in meeting both criteria. For instance, Defects4J, one of the foremost Java benchmarks, last received an update in 2020. Moreover, full-reproducibility has been neglected by the majority of existing benchmarks. In this paper, we present GitBug-Actions: a novel tool for building bug-fix benchmarks with modern and fully-reproducible bug-fixes. GitBug- Actions relies on the most popular CI platform, GitHub Actions, to detect bug-fixes a nd s martly l ocally e xecute t he CI pipeline in a controlled and reproducible environment. To the best of our knowledge, we are the first t o r ely o n G itHub Actions t o collect bug-fixes. 
To demonstrate our toolchain, we deploy GitBug-Actions to build a proof-of-concept Go bug-fix benchmark containing executable, fully-reproducible bug-fixes from different repositories.
}, year = {2024} } @inproceedings{Cesarano1927857, author = {Cesarano, Carmine and Andersson, Vivi and Natella, Roberto and Monperrus, Martin}, booktitle = {SCORED 2024 - Proceedings of the 2024 Workshop on Software Supply Chain Offensive Research and Ecosystem Defenses, Co-Located with: CCS 2024 : }, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Università degli Studi di Napoli Federico II Naples, Italy}, institution = {Università degli Studi di Napoli Federico II Naples, Italy}, note = {Part of ISBN 979-840071240-1QC 20250117}, pages = {33--42}, publisher = {Association for Computing Machinery (ACM)}, title = {GoSurf: Identifying Software Supply Chain Attack Vectors in Go}, DOI = {10.1145/3689944.3696166}, keywords = {Golang, Open-Source Security, Supply Chain Attacks}, abstract = {In Go, the widespread adoption of open-source software has led to a flourishing ecosystem of third-party dependencies, which are often integrated into critical systems. However, the reuse of dependencies introduces significant supply chain security risks, as a single compromised package can have cascading impacts. Existing supply chain attack taxonomies overlook language-specific features that can be exploited by attackers to hide malicious code. In this paper, we propose a novel taxonomy of 12 distinct attack vectors tailored for the Go language and its package lifecycle. Our taxonomy identifies patterns in which language-specific Go features, intended for benign purposes, can be misused to propagate malicious code stealthily through supply chains. Additionally, we introduce GoSurf, a static analysis tool that analyzes the attack surface of Go packages according to our proposed taxonomy. We evaluate GoSurf on a corpus of 500 widely used, real-world Go packages. 
Our work provides preliminary insights for securing the open-source software supply chain within the Go ecosystem, allowing developers and security analysts to prioritize code audit efforts and uncover hidden malicious behaviors. }, year = {2024} } @inproceedings{Zetterlund1697096, author = {Zetterlund, Louise and Tiwari, Deepika and Monperrus, Martin and Baudry, Benoit}, booktitle = {2022 IEEE Conference on Software Testing, Verification and Validation (ICST 2022) : }, institution = {KTH, Software and Computer systems, SCS}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Redeye AB, Stockholm, Sweden.}, note = {Part of proceedings: ISBN 978-1-6654-6679-0, QC 20220920}, pages = {365--376}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {Harvesting Production GraphQL Queries to Detect Schema Faults}, series = {IEEE International Conference on Software Testing Verification and Validation}, DOI = {10.1109/ICST53961.2022.00014}, keywords = {GraphQL, production monitoring, automated test generation, test oracle, API testing, schema}, abstract = {GraphQL is a new paradigm to design web APIs. Despite its growing popularity, there are few techniques to verify the implementation of a GraphQL, API. We present a new testing approach based on GraphQL queries that are logged while users interact with an application in production. Our core motivation is that production queries capture real usages of the application, and are known to trigger behavior that may not be tested by developers. For each logged query, a test is generated to assert the validity of the GraphQL response with respect to the schema. We implement our approach in a tool called AutoGraphQL, and evaluate it on two real-world case studies that are diverse in their domain and technology stack: an open-source e-commerce application implemented in Python called Saleor, and an industrial case study which is a PHP-based finance website called Frontapp. 
AutoGraphQL successfully generates test cases for the two applications. The generated tests cover 26.9% of the Saleor schema, including parts of the API not exercised by the original test suite, as well as 48.7% of the Frontapp schema, detecting 8 schema faults, thanks to production queries. }, year = {2022} } @article{RonArteaga1881983, author = {Ron Arteaga, Javier and Soto Valero, C{\’e;}sar and Zhang, Long and Baudry, Benoit and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {KTH, Software and Computer systems, SCS}, journal = {IEEE Transactions on Dependable and Secure Computing}, note = {QC 20240704}, number = {4}, pages = {4084--4097}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {Highly Available Blockchain Nodes With N-Version Design}, volume = {21}, DOI = {10.1109/TDSC.2023.3346195}, keywords = {availability, blockchain, Blockchains, Computer architecture, N-Version design, Peer-to-peer computing, Programming, Prototypes, Software, Time factors}, abstract = {As all software, blockchain nodes are exposed to faults in their underlying execution stack. Unstable execution environments can disrupt the availability of blockchain nodes' interfaces, resulting in downtime for users. This paper introduces the concept of N-Version Blockchain nodes. This new type of node relies on simultaneous execution of different implementations of the same blockchain protocol, in the line of Avizienis' N-Version programming vision. We design and implement an N-Version blockchain node prototype in the context of Ethereum, called N-ETH. We show that N-ETH is able to mitigate the effects of unstable execution environments and significantly enhance availability under environment faults. To simulate unstable execution environments, we perform fault injection at the system-call level. Our results show that existing Ethereum node implementations behave asymmetrically under identical instability scenarios. 
N-ETH leverages this asymmetric behavior available in the diverse implementations of Ethereum nodes to provide increased availability, even under our most aggressive fault-injection strategies. We are the first to validate the relevance of N-Version design in the domain of blockchain infrastructure. From an industrial perspective, our results are of utmost importance for businesses operating blockchain nodes, including Google, ConsenSys, and many other major blockchain companies. }, year = {2024} } @inproceedings{Urli1264330, author = {Urli, S. and Yu, Z. and Seinturier, L. and Monperrus, Martin}, booktitle = {Proceeding ICSE-SEIP '18 Proceedings of the 40th International Conference on Software Engineering: Software Engineering in Practice : }, institution = {KTH, Theoretical Computer Science, TCS}, note = {QC 20181120}, pages = {95--104}, title = {How to design a program repair bot? : Insights from the repairnator project}, series = {Proceedings - International Conference on Software Engineering}, DOI = {10.1145/3183519.3183540}, abstract = {Program repair research has made tremendous progress over the last few years, and software development bots are now being invented to help developers gain productivity. In this paper, we investigate the concept of a "program repair bot" and present Repairnator. The Repairnator bot is an autonomous agent that constantly monitors test failures, reproduces bugs, and runs program repair tools against each reproduced bug. If a patch is found, Repairnator bot reports it to the developers. At the time of writing, Repairnator uses three different program repair systems and has been operating since February 2017. In total, it has studied 11 523 test failures over 1 609 open-source software projects hosted on GitHub, and has generated patches for 15 different bugs. Over months, we hit a number of hard technical challenges and had to make various design and engineering decisions. This gives us a unique experience in this area. 
In this paper, we reflect upon Repairnator in order to share this knowledge with the automatic program repair community. }, ISBN = {9781450356596}, year = {2018} } @techreport{Monperrus1753115, author = {Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, note = {QC 20230426}, title = {How to make a good open-science repository?}, URL = {https://communities.springernature.com/posts/how-to-make-a-good-open-science-repository}, year = {2019} } @article{Borg1781999, author = {Borg, Markus and Aasa, Emil and Etemadi, Khashayar and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {CodeScene, Principal Researcher, Malmö, Sweden, 21 532; Lund University, Adjunct Lecturer, Lund, Sweden}, institution = {CodeScene, Senior Developer, Luleå, Sweden}, journal = {IEEE Software}, note = {QC 20230711}, number = {3}, pages = {9--14}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {Human, What Must I Tell You?}, volume = {40}, DOI = {10.1109/MS.2023.3244638}, abstract = {Artificial intelligence (AI)-assisted code generation is everywhere these days. Undoubtedly, AI will help near-future developers substantially by providing code suggestions and automation. In this application, explainability will be a key quality attribute. But what needs to be explained to whom? And how to deliver the explanations nonintrusively? }, year = {2023} } @article{Baudry1954117, author = {Baudry, Benoit and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, journal = {ACM Inroads}, note = {QC 20250424}, title = {Humor for graduate training}, DOI = {10.1145/3730408}, keywords = {humor; higher education}, abstract = {Humor genuinely engages graduate students with their scientific training. 
}, year = {2025} } @article{Martinez1811789, author = {Martinez, Matias and Falleri, Jean Remy and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Universitat Politècnica de Catalunya, CP, Barcelona, Spain, 08034, CP}, institution = {CNRS, Bordeaux INP, LaBRI, Univ. Bordeaux, Talence, France, CP F-33400, Talence; Institut Universitaire de France, France}, journal = {IEEE Transactions on Software Engineering}, note = {QC 20231114}, number = {10}, pages = {4814--4828}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {Hyperparameter Optimization for AST Differencing}, volume = {49}, DOI = {10.1109/TSE.2023.3315935}, keywords = {Abstract Syntax Trees (AST), hyperparameter optimization, edit-script, Software evolution, Tree differencing}, abstract = {Computing the differences between two versions of the same program is an essential task for software development and software evolution research. AST differencing is the most advanced way of doing so, and an active research area. Yet, AST differencing algorithms rely on configuration parameters that may have a strong impact on their effectiveness. In this paper, we present a novel approach named DAT (D iff Auto Tuning) for hyperparameter optimization of AST differencing. We thoroughly state the problem of hyper-configuration for AST differencing. We evaluate our data-driven approach DAT to optimize the edit-scripts generated by the state-of-the-art AST differencing algorithm named GumTree in different scenarios. DAT is able to find a new configuration for GumTree that improves the edit-scripts in 21.8% of the evaluated cases. }, year = {2023} } @inproceedings{Koyuncu1358069, author = {Koyuncu, Anil and Liu, Kui and Bissyande, Tegawende F. 
and Kim, Dongsun and Monperrus, Martin and Klein, Jacques and Le Traon, Yves}, booktitle = {ESEC/FSE'2019 : PROCEEDINGS OF THE 2019 27TH ACM JOINT MEETING ON EUROPEAN SOFTWARE ENGINEERING CONFERENCE AND SYMPOSIUM ON THE FOUNDATIONS OF SOFTWARE ENGINEERING}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Univ Luxembourg, Luxembourg, Luxembourg.}, institution = {Univ Luxembourg, Luxembourg, Luxembourg.}, institution = {Univ Luxembourg, Luxembourg, Luxembourg.}, institution = {Univ Luxembourg, Luxembourg, Luxembourg.;Furiosa Ai, Seoul, South Korea.}, institution = {Univ Luxembourg, Luxembourg, Luxembourg.}, institution = {Univ Luxembourg, Luxembourg, Luxembourg.}, note = {QC 20191007}, pages = {314--325}, publisher = {ASSOC COMPUTING MACHINERY}, title = {iFixR : Bug Report driven Program Repair}, DOI = {10.1145/3338906.3338935}, keywords = {Information retrieval, fault localization, automatic patch generation}, abstract = {Issue tracking systems are commonly used in modern software development for collecting feedback from users and developers. An ultimate automation target of software maintenance is then the systematization of patch generation for user-reported bugs. Although this ambition is aligned with the momentum of automated program repair, the literature has, so far, mostly focused on generate-and-validate setups where fault localization and patch generation are driven by a well-defined test suite. On the one hand, however, the common (yet strong) assumption on the existence of relevant test cases does not hold in practice for most development settings: many bugs are reported without the available test suite being able to reveal them. On the other hand, for many projects, the number of bug reports generally outstrips the resources available to triage them. 
Towards increasing the adoption of patch generation tools by practitioners, we investigate a new repair pipeline, iFixR, driven by bug reports: (1) bug reports are fed to an IR-based fault localizer; (2) patches are generated from fix patterns and validated via regression testing; (3) a prioritized list of generated patches is proposed to developers. We evaluate iFixR on the Defects4J dataset, which we enriched (i.e., faults are linked to bug reports) and carefully-reorganized (i.e., the timeline of test-cases is naturally split). iFixR generates genuine/plausible patches for 21/44 Defects4J faults with its IR-based fault localizer. iFixR accurately places a genuine/plausible patch among its top-5 recommendation for 8/13 of these faults (without using future test cases in generation-and-validation). }, year = {2019} } @article{Muntean1422639, author = {Muntean, P. and Monperrus, Martin and Sun, H. and Grossklags, J. and Eckert, C.}, institution = {KTH, Theoretical Computer Science, TCS}, journal = {IEEE Transactions on Software Engineering}, note = {QC 20211112}, number = {10}, pages = {2225--2241}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {IntRepair : Informed Repairing of Integer Overflows}, volume = {47}, DOI = {10.1109/TSE.2019.2946148}, keywords = {Engines, Fault detection, Fuses, integer overflow, Maintenance engineering, Program repair, Runtime, Software, software bug, source code refactoring, static program analysis, symbolic execution, Tools, Application programs, Computer software, Electric fuses, Integer programming, Maintainability, Model checking, Repair, Runtimes, Source codes, C (programming language)}, abstract = {Integer overflows have threatened software applications for decades. Thus, in this paper, we propose a novel technique to provide automatic repairs of integer overflows in C source code. Our technique, based on static symbolic execution, fuses detection, repair generation and validation. 
This technique is implemented in a prototype named IntRepair. We applied IntRepair to 2,052 C programs (approx. 1 million lines of code) contained in SAMATE's Juliet test suite and 50 synthesized programs that range up to 20 KLOC. Our experimental results show that IntRepair is able to effectively detect integer overflows and successfully repair them, while only increasing the source code (LOC) and binary (Kb) size by around 1%, respectively. Furthermore, we present the results of a user study with 30 participants showing that IntRepair repairs are more efficient than manual repairs.  }, year = {2021} } @inproceedings{Ye1841394, author = {Ye, He and Monperrus, Martin}, booktitle = {ICSE 2024 - Proceedings of the 46th IEEE/ACM International Conference on Software Engineering : }, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Carnegie Mellon University United States}, note = {Part of proceedings ISBN 9798400702174QC 20240305}, publisher = {Association for Computing Machinery (ACM)}, title = {ITER : Iterative Neural Repair for Multi-Location Patches}, DOI = {10.1145/3597503.3623337}, abstract = {Automated program repair (APR) has achieved promising results, especially using neural networks. Yet, the overwhelming majority of patches produced by APR tools are confined to one single location. When looking at the patches produced with neural repair, most of them fail to compile, while a few uncompilable ones go in the right direction. In both cases, the fundamental problem is to ignore the potential of partial patches. In this paper, we propose an iterative program repair paradigm called ITER founded on the concept of improving partial patches until they become plausible and correct. First, ITER iteratively improves partial single-location patches by fixing compilation errors and further refining the previously generated code. 
Second, ITER iteratively improves partial patches to construct multi-location patches, with fault localization re-execution. ITER is implemented for Java based on battle-proven deep neural networks and code representation. ITER is evaluated on 476 bugs from 10 open-source projects in Defects4J 2.0. ITER succeeds in repairing 15.5% of them, including 9 uniquely repaired multi-location bugs. }, year = {2024} } @article{Harrand1467385, author = {Harrand, Nicolas and Soto Valero, C{\'e}sar and Monperrus, Martin and Baudry, Benoit}, institution = {KTH, Software and Computer systems, SCS}, institution = {KTH, Theoretical Computer Science, TCS}, journal = {Journal of Systems and Software}, note = {QC 20210602}, eid = {110645}, publisher = {Elsevier BV}, title = {Java decompiler diversity and its application to meta-decompilation}, volume = {168}, DOI = {10.1016/j.jss.2020.110645}, keywords = {Java bytecode, Decompilation, Reverse engineering, Source code analysis}, abstract = {During compilation from Java source code to bytecode, some information is irreversibly lost. In other words, compilation and decompilation of Java code is not symmetric. Consequently, decompilation, which aims at producing source code from bytecode, relies on strategies to reconstruct the information that has been lost. Different Java decompilers use distinct strategies to achieve proper decompilation. In this work, we hypothesize that the diverse ways in which bytecode can be decompiled has a direct impact on the quality of the source code produced by decompilers. In this paper, we assess the strategies of eight Java decompilers with respect to three quality indicators: syntactic correctness, syntactic distortion and semantic equivalence modulo inputs. Our results show that no single modern decompiler is able to correctly handle the variety of bytecode structures coming from real-world programs. 
The highest ranking decompiler in this study produces syntactically correct, and semantically equivalent code output for 84%, respectively 78%, of the classes in our dataset. Our results demonstrate that each decompiler correctly handles a different set of bytecode classes. We propose a new decompiler called Arlecchino that leverages the diversity of existing decompilers. To do so, we merge partial decompilation into a new one based on compilation errors. Arlecchino handles 37.6% of bytecode classes that were previously handled by no decompiler. We publish the sources of this new bytecode decompiler. (C) 2020 Published by Elsevier Inc. }, year = {2020} } @article{Yu1791105, author = {Yu, Zhongxing and Martinez, Matias and Chen, Zimin and Bissyande, Tegawende F. F. and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Shandong Univ, Jinan, Peoples R China.}, institution = {Univ Politecn Cataluna, Barcelona, Spain.}, institution = {Univ Luxembourg, L-4365 Esch Sur Alzette, Luxembourg.}, journal = {IEEE Transactions on Software Engineering}, note = {QC 20231127}, number = {7}, pages = {3872--3900}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {Learning the Relation Between Code Features and Code Transforms With Structured Prediction}, volume = {49}, DOI = {10.1109/TSE.2023.3275380}, keywords = {Code transform, big code, machine learning, program repair}, abstract = {To effectively guide the exploration of the code transform space for automated code evolution techniques, we present in this article the first approach for structurally predicting code transforms at the level of AST nodes using conditional random fields (CRFs). Our approach first learns offline a probabilistic model that captures how certain code transforms are applied to certain AST nodes, and then uses the learned model to predict transforms for arbitrary new, unseen code snippets. 
Our approach involves a novel representation of both programs and code transforms. Specifically, we introduce the formal framework for defining the so-called AST-level code transforms and we demonstrate how the CRF model can be accordingly designed, learned, and used for prediction. We instantiate our approach in the context of repair transform prediction for Java programs. Our instantiation contains a set of carefully designed code features, deals with the training data imbalance issue, and comprises transform constraints that are specific to code. We conduct a large-scale experimental evaluation based on a dataset of bug fixing commits from real-world Java projects. The results show that when the popular evaluation metric top-3 is used, our approach predicts the code transforms with an accuracy varying from 41% to 53% depending on the transforms. Our model outperforms two baselines based on history probability and neural machine translation (NMT), suggesting the importance of considering code structure in achieving good prediction accuracy. In addition, a proof-of-concept synthesizer is implemented to concretize some repair transforms to get the final patches. The evaluation of the synthesizer on the Defects4j benchmark confirms the usefulness of the predicted AST-level repair transforms in producing high-quality patches. }, year = {2023} } @article{Baudry1748707, author = {Baudry, Benoit and Toady, Tim and Monperrus, Martin}, institution = {KTH, Software and Computer systems, SCS}, institution = {KTH}, institution = {KTH, Theoretical Computer Science, TCS}, journal = {Queue}, note = {QC 20230404}, number = {2}, pages = {31--42}, publisher = {Association for Computing Machinery (ACM)}, title = {Long Live Software Easter Eggs!}, volume = {20}, DOI = {10.1145/3534857}, keywords = {HTTP Header, Tooltips, HTTP}, abstract = {It's a period of unrest. Rebel developers, striking from continuous deployment servers, have won their first victory. 
During the battle, rebel spies managed to push an epic commit in the HTML code of https://pro.sony. Pursued by sinister agents, the rebels are hiding in commits, buttons, tooltips, API, HTTP headers, and configuration screens.  }, year = {2022} } @article{Zhang1437442, author = {Zhang, Long and Morin, Brice and Baudry, Benoit and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {KTH, Software and Computer systems, SCS}, institution = {Tellu, Asker, Norway}, journal = {IEEE Transactions on Dependable and Secure Computing}, note = {QC 20250327}, number = {4}, pages = {2695--2708}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {Maximizing Error Injection Realism for Chaos Engineering with System Calls}, volume = {19}, DOI = {10.1109/TDSC.2021.3069715}, keywords = {fault injection, system call, chaos engineering}, abstract = {In this paper, we present a novel fault injection framework for system call invocation errors, called Phoebe. Phoebe is unique as follows; First, Phoebe enables developers to have full observability of system call invocations. Second, Phoebe generates error models that are realistic in the sense that they mimic errors that naturally happen in production. Third, Phoebe is able to automatically conduct experiments to systematically assess the reliability of applications with respect to system call invocation errors in production. We evaluate the effectiveness and runtime overhead of Phoebe on two real-world applications in a production environment. The results show that Phoebe successfully generates realistic error models and is able to detect important reliability weaknesses with respect to system call invocation errors. To our knowledge, this novel concept of "realistic error injection", which consists of grounding fault injection on production errors, has never been studied before. 
}, year = {2022} } @article{Tiwari1911846, author = {Tiwari, Deepika and Monperrus, Martin and Baudry, Benoit}, institution = {KTH, Software and Computer systems, SCS}, institution = {KTH, Theoretical Computer Science, TCS}, journal = {IEEE Transactions on Software Engineering}, note = {QC 20250120}, number = {11}, pages = {2921--2946}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {Mimicking Production Behavior with Generated Mocks}, volume = {50}, DOI = {10.1109/tse.2024.3458448}, abstract = {Mocking allows testing program units in isolation. A developer who writes tests with mocks faces two challenges: design realistic interactions between a unit and its environment; and understand the expected impact of these interactions on the behavior of the unit. In this paper, we propose to monitor an application in production to generate tests that mimic realistic execution scenarios through mocks. Our approach operates in three phases. First, we instrument a set of target methods for which we want to generate tests, as well as the methods that they invoke, which we refer to as mockable method calls. Second, in production, we collect data about the context in which target methods are invoked, as well as the parameters and the returned value for each mockable method call. Third, offline, we analyze the production data to generate test cases with realistic inputs and mock interactions. The approach is automated and implemented in an open-source tool called RICK. We evaluate our approach with three real-world, opensource Java applications. RICK monitors the invocation of 128 methods in production across the three applications and captures their behavior. Based on this captured data, RICK generates test cases that include realistic initial states and test inputs, as well as mocks and stubs. 
All the generated test cases are executable, and 52.4% of them successfully mimic the complete execution context of the target methods observed in production. The mock-based oracles are also effective at detecting regressions within the target methods, complementing each other in their fault-finding ability. We interview 5 developers from the industry who confirm the relevance of using production observations to design mocks and stubs. Our experimental findings clearly demonstrate the feasibility and added value of generating mocks from production interactions. }, year = {2024} } @unpublished{Etemadi1907143, author = {Etemadi, Khashayar and Mohammadi, Bardia and Zhendong, Su and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, note = {QC 20241023}, title = {Mokav: Execution-driven Differential Testing with LLMs}, } @article{Etemadi1997514, author = {Etemadi, Khashayar and Mohammadi, Bardia and Su, Zhendong and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {ETH Zurich, Switzerland; KTH Royal Institute of Technology, Sweden}, institution = {Sharif University of Technology, Iran}, institution = {ETH Zurich, Switzerland}, journal = {Journal of Systems and Software}, note = {QC 20250912}, eid = {112571}, publisher = {Elsevier BV}, title = {Mokav: Execution-driven differential testing with LLMs}, volume = {230}, DOI = {10.1016/j.jss.2025.112571}, keywords = {Behavioral difference, Large language models, Test generation}, abstract = {It is essential to detect functional differences between programs in various software engineering tasks, such as automated program repair, mutation testing, and code refactoring. The problem of detecting functional differences between two programs can be reduced to searching for a difference exposing test (DET): a test input that results in different outputs on the subject programs. 
In this paper, we propose MOKAV, a novel execution-driven tool that leverages LLMs to generate DETs. MOKAV takes two versions of a program (P and Q) and an example test input. When successful, MOKAV generates a valid DET, a test input that leads to provably different outputs on P and Q. MOKAV iteratively prompts an LLM with a specialized prompt to generate new test inputs. At each iteration, MOKAV provides execution-based feedback from previously generated tests until the LLM produces a DET. We evaluate MOKAV on 1535 pairs of Python programs collected from the Codeforces competition platform and 32 pairs of programs from the QuixBugs dataset. Our experiments show that MOKAV outperforms the state-of-the-art, Pynguin and Differential Prompting, by a large margin. MOKAV can generate DETs for 81.7% (1,255/1535) of the program pairs in our benchmark (versus 4.9% for Pynguin and 37.3% for Differential Prompting). We demonstrate that the iterative and execution-driven feedback components of the system contribute to its high effectiveness. }, year = {2025} } @techreport{Monperrus1953484, author = {Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, note = {QC 20250424}, pages = {61}, title = {Most Cited Papers in Software Engineering 2013-2023}, DOI = {10.5281/zenodo.14885765}, abstract = {This compilation presents a list of the most cited research papers in software engineering from 2013 to 2023, published in leading academic venues. By leveraging APIs from CrossRef and Semantic Scholar, we systematically gather and rank influential works based on citation metrics, providing a valuable resource for researchers, educators, and industry professionals to understand the field. This document can also serve for individuals to strengthen their academic credits with impact facts. Full bibliometric data is accessible in the accompanying repository. 
}, URL = {https://github.com/ASSERT-KTH/most-cited-se-papers}, year = {2025} } @inproceedings{Gu1660002, author = {Gu, Jian and Chen, Zimin and Monperrus, Martin}, booktitle = {2021 IEEE international conference on software maintenance and evolution (ICSME 2021) : }, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Univ Zurich, Zurich, Switzerland.}, note = {QC 20220819Part of proceedings: ISBN 978-1-6654-2882-8}, pages = {483--494}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {Multimodal Representation for Neural Code Search}, series = {Proceedings-IEEE International Conference on Software Maintenance}, DOI = {10.1109/ICSME52107.2021.00049}, keywords = {multimodal learning, program representation, information completeness, tree serialization, code search}, abstract = {Semantic code search is about finding semantically relevant code snippets for a given natural language query. In the state-of-the-art approaches, the semantic similarity between code and query is quantified as the distance of their representation in the shared vector space. In this paper, to improve the vector space, we introduce tree-serialization methods on a simplified form of AST and build the multimodal representation for the code data. We conduct extensive experiments using a single corpus that is large-scale and multi-language: CodeSearchNet. Our results show that both our tree-serialized representations and multimodal learning model improve the performance of code search. Last, we define intuitive quantification metrics oriented to the completeness of semantic and syntactic information of the code data, to help understand the experimental findings. 
}, year = {2021} } @unpublished{CabreraArteaga1694409, author = {Cabrera Arteaga, Javier and Laperdrix, Pierre and Monperrus, Martin and Baudry, Benoit}, institution = {KTH, Software and Computer systems, SCS}, institution = {KTH, Theoretical Computer Science, TCS}, note = {QC 20220912}, title = {Multi-variant Execution at the Edge}, keywords = {Diversification, Moving Target Defense, Edge-Cloud computing, Multivariant execution, WebAssembly}, abstract = {Edge-Cloud computing offloads parts of the computations that traditionally occurs in the cloud to edge nodes. The binary format WebAssembly is increasingly used to distribute and deploy services on such platforms. Edge-Cloud computing providers let their clients deploy stateless services in the form of WebAssembly binaries, which are then translated to machine code, sandboxed and executed at the edge. In this context, we propose a technique that (i) automatically diversifies WebAssembly binaries that are deployed to the edge and (ii) randomizes execution paths at runtime. Thus, an attacker cannot exploit all edge nodes with the same payload. Given a service, we automatically synthesize functionally equivalent variants for the functions providing the service. All the variants are then wrapped into a single multivariant WebAssembly binary. When the service endpoint is executed, every time a function is invoked, one of its variants is randomly selected. We implement this technique in the MEWE tool and we validate it with 7 services for which MEWE generates multivariant binaries that embed hundreds of function variants. We execute the multivariant binaries on the world-wide edge platform provided by Fastly, as part as a research collaboration. We show that multivariant binaries exhibit a real diversity of execution traces across the whole edge platform distributed around the globe. 
}, } @inproceedings{CabreraArteaga1785390, author = {Cabrera Arteaga, Javier and Laperdrix, Pierre and Monperrus, Martin and Baudry, Benoit}, booktitle = {MTD 2022 : Proceedings of the 9th ACM Workshop on Moving Target Defense, co-located with CCS 2022}, institution = {KTH, Software and Computer systems, SCS}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Centre National de la Recherche Scientifique CNRS, Paris, France}, note = {Part of ISBN 9781450398787QC 20230802}, pages = {11--22}, publisher = {Association for Computing Machinery (ACM)}, title = {Multi-variant Execution at the Edge}, DOI = {10.1145/3560828.3564007}, keywords = {diversification, edge-cloud computing, moving target defense, multivariant execution, webassembly}, abstract = {Edge-Cloud computing offloads parts of the computations that traditionally occurs in the cloud to edge nodes. The binary format WebAssembly is increasingly used to distribute and deploy services on such platforms. Edge-Cloud computing providers let their clients deploy stateless services in the form of WebAssembly binaries, which are then translated to machine code, sandboxed and executed at the edge. In this context, we propose a technique that (i) automatically diversifies WebAssembly binaries that are deployed to the edge and (ii) randomizes execution paths at runtime. Thus, an attacker cannot exploit all edge nodes with the same payload. Given a service, we automatically synthesize functionally equivalent variants for the functions providing the service. All the variants are then wrapped into a single multivariant WebAssembly binary. When the service endpoint is executed, every time a function is invoked, one of its variants is randomly selected. We implement this technique in the MEWE tool and we validate it with 7 services for which MEWE generates multivariant binaries that embed hundreds of function variants. 
We execute the multivariant binaries on the world-wide edge platform provided by Fastly, as part as a research collaboration. We show that multivariant binaries exhibit a real diversity of execution traces across the whole edge platform distributed around the globe. }, year = {2022} } @inproceedings{Ye1692854, author = {Ye, He and Martinez, Matias and Monperrus, Martin}, booktitle = {ICSE '22: Proceedings of the 44th International Conference on Software Engineering : }, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Univ Polytech Hauts de France, Valenciennes, France.}, note = {QC 20221109Part of proceedings: ISBN 978-145039221-1}, pages = {1506--1518}, publisher = {Association for Computing Machinery (ACM)}, title = {Neural Program Repair with Execution-based Backpropagation}, series = {International Conference on Software Engineering}, DOI = {10.1145/3510003.3510222}, abstract = {Neural machine translation (NMT) architectures have achieved promising results for automatic program repair. Yet, they have the limitation of generating low-quality patches (e.g., not compilable patches). This is because the existing works only optimize a purely syntactic loss function based on characters and tokens without incorporating program-specific information during neural network weight optimization. In this paper, we propose a novel program repair model called RewardRepair. The core novelty of RewardRepair is to improve NMT-based program repair with a loss function based on program compilation and test execution information, rewarding the network to produce patches that compile and that do not overfit. We conduct several experiments to evaluate RewardRepair showing that it is feasible and effective to use compilation and test execution results to optimize the underlying neural repair model. RewardRepair correctly repairs 207 bugs over four benchmarks. we report on repair success for 121 bugs that are fixed for the first time in the literature. 
Also, RewardRepair produces up to 45.3% of compilable patches, an improvement over the 39% by the state-of-the-art. }, year = {2022} } @article{Chen1706456, author = {Chen, Zimin and Kommrusch, Steve James and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Colorado State University, USA}, journal = {IEEE Transactions on Software Engineering}, note = {QC 20231117}, number = {1}, pages = {147--165}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {Neural Transfer Learning for Repairing Security Vulnerabilities in C Code}, volume = {49}, DOI = {10.1109/TSE.2022.3147265}, keywords = {Codes, Computer bugs, seq2seq learning, Software, Task analysis, Training, transfer learning, Transformers, vulnerability fixing, C (programming language), Costs, Deep learning, Job analysis, Knowledge management, Personnel training, Program debugging, Code, Security vulnerabilities, Transformer, Repair}, abstract = {In this paper, we address the problem of automatic repair of software vulnerabilities with deep learning. The major problem with data-driven vulnerability repair is that the few existing datasets of known confirmed vulnerabilities consist of only a few thousand examples. However, training a deep learning model often requires hundreds of thousands of examples. In this work, we leverage the intuition that the bug fixing task and the vulnerability fixing task are related and that the knowledge learned from bug fixes can be transferred to fixing vulnerabilities. In the machine learning community, this technique is called transfer learning. In this paper, we propose an approach for repairing security vulnerabilities named VRepair which is based on transfer learning. VRepair is first trained on a large bug fix corpus and is then tuned on a vulnerability fix dataset, which is an order of magnitude smaller. 
In our experiments, we show that a model trained only on a bug fix corpus can already fix some vulnerabilities. Then, we demonstrate that transfer learning improves the ability to repair vulnerable C functions. We also show that the transfer learning model performs better than a model trained with a denoising task and fine-tuned on the vulnerability fixing task. To sum up, this paper shows that transfer learning works well for repairing security vulnerabilities in C compared to learning on a small dataset. }, year = {2023} } @article{Simonsson1368908, author = {Simonsson, Jesper and Zhang, Long and Morin, Brice and Baudry, Benoit and Monperrus, Martin}, institution = {KTH}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {KTH, Software and Computer systems, SCS}, institution = {Tellu, Pb. 440 Asker, 1373 Asker, Norway}, journal = {Future Generation Computer Systems}, note = {QC 20210614}, pages = {117--129}, publisher = {Elsevier BV}, title = {Observability and Chaos Engineering on System Calls for Containerized Applications in Docker}, volume = {122}, DOI = {10.1016/j.future.2021.04.001}, keywords = {fault injection, chaos engineering, system call, containers, observability}, abstract = {In this paper, we present a novel fault injection system called ChaosOrca for system calls in containerized applications. ChaosOrca aims at evaluating a given application's self-protection capability with respect to system call errors. The unique feature of ChaosOrca is that it conducts experiments under production-like workload without instrumenting the application. We exhaustively analyze all kinds of system calls and utilize different levels of monitoring techniques to reason about the behaviour under perturbation. We evaluate ChaosOrca on three real-world applications: a file transfer client, a reverse proxy server and a micro-service oriented web application. 
Our results show that it is promising to detect weaknesses of resilience mechanisms related to system calls issues. }, URL = {https://arxiv.org/abs/1907.13039}, URL = {https://www.sciencedirect.com/science/article/pii/S0167739X21001163}, year = {2021} } @inproceedings{Etemadi1528715, author = {Etemadi, Khashayar and Monperrus, Martin}, booktitle = {Proceedings - 2020 IEEE/ACM 42nd International Conference on Software Engineering Workshops, ICSEW 2020 : }, institution = {KTH, Theoretical Computer Science, TCS}, note = {QC 20210216}, pages = {470--475}, publisher = {Association for Computing Machinery, Inc}, title = {On the Relevance of Cross-project Learning with Nearest Neighbours for Commit Message Generation}, DOI = {10.1145/3387940.3391488}, keywords = {commit message generation, nearest neighbor algorithm, neural machine translation, Computer aided language translation, Technical presentations, Generation method, High quality, Machine translations, Nearest neighbour, Project learning, Software maintenance and evolution, Software engineering}, abstract = {Commit messages play an important role in software maintenance and evolution. Nonetheless, developers often do not produce high-quality messages. A number of commit message generation methods have been proposed in recent years to address this problem. Some of these methods are based on neural machine translation (NMT) techniques. Studies show that the nearest neighbor algorithm (NNGen) outperforms existing NMT-based methods, although NNGen is simpler and faster than NMT. In this paper, we show that NNGen does not take advantage of cross-project learning in the majority of the cases. We also show that there is an even simpler and faster variation of the existing NNGen method which outperforms it in terms of the BLEU_4 score without using cross-project learning. 
}, year = {2020} } @article{Tiwari1639558, author = {Tiwari, Deepika and Zhang, Long and Monperrus, Martin and Baudry, Benoit}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {KTH, Software and Computer systems, SCS}, journal = {IEEE Transactions on Reliability}, note = {QC 20250326}, number = {3}, pages = {1381--1397}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {Production Monitoring to Improve Test Suites}, volume = {71}, DOI = {10.1109/tr.2021.3101318}, keywords = {Production monitoring, test generation, test improvement, test oracle, test quality}, abstract = {In this article, we propose to use production executions to improve the quality of testing for certain methods of interest for developers. These methods can be methods that are not covered by the existing test suite or methods that are poorly tested. We devise an approach called pankti which monitors applications as they execute in production and then automatically generates differential unit tests, as well as derived oracles, from the collected data. pankti’s monitoring and generation focuses on one single programming language, Java. We evaluate it on three real-world, open-source projects: a videoconferencing system, a PDF manipulation library, and an e-commerce application. We show that pankti is able to generate differential unit tests by monitoring target methods in production and that the generated tests improve the quality of the test suite of the application under consideration. 
}, year = {2022} } @article{Baudry1885021, author = {Baudry, Benoit and Monperrus, Martin}, institution = {KTH, Software and Computer systems, SCS}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Univ Montreal, Software Engn, Montreal, PQ H3T 1J4, Canada}, journal = {Computer}, note = {QC 20240719}, number = {7}, pages = {104--108}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {Programming Art With Drawing Machines}, volume = {57}, DOI = {10.1109/MC.2024.3385049}, abstract = {Algorithmic artists master programming to create art. Specialized libraries and hardware devices such as pen plotters support their practice. }, year = {2024} } @inproceedings{Tiwari1911847, author = {Tiwari, Deepika and Gamage, Yogya and Monperrus, Martin and Baudry, Benoit}, booktitle = {Proceedings - 2024 IEEE International Conference on Source Code Analysis and Manipulation, SCAM 2024 : }, institution = {KTH, Software and Computer systems, SCS}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Université de Montréal, Montréal, Canada}, institution = {Université de Montréal, Montréal, Canada}, note = {Part of ISBN 9798331528508QC 20241111}, pages = {166--176}, title = {PROZE: Generating Parameterized Unit Tests Informed by Runtime Data}, DOI = {10.1109/SCAM63643.2024.00025}, abstract = {Typically, a conventional unit test (CUT) verifies the expected behavior of the unit under test through one specific input / output pair. In contrast, a parameterized unit test (PUT) receives a set of inputs as arguments, and contains assertions that are expected to hold true for all these inputs. PUTs increase test quality, as they assess correctness on a broad scope of inputs and behaviors. However, defining assertions over a set of inputs is a hard task for developers, which limits the adoption of PUTs in practice. In this paper, we address the problem of finding oracles for PUTs that hold over multiple inputs. 
We design a system called PROZE, that generates PUTs by identifying developer-written assertions that are valid for more than one test input. We implement our approach as a two-step methodology: first, at runtime, we collect inputs for a target method that is invoked within a CUT; next, we isolate the valid assertions of the CUT to be used within a PUT. We evaluate our approach against 5 real-world Java modules, and collect valid inputs for 128 target methods, from test and field executions. We generate 2,287 PUTs, which invoke the target methods with a significantly larger number of test inputs than the original CUTs. We execute the PUTs and find 217 that provably demonstrate that their oracles hold for a larger range of inputs than envisioned by the developers. From a testing theory perspective, our results show that developers express assertions within CUTs, which actually hold beyond one particular input. }, URL = {https://conf.researchr.org/details/scam-2024/SCAM-2024-research-track/10/PROZE-Generating-Parameterized-Unit-Tests-Informed-by-Runtime-Data}, URL = {https://conf.researchr.org/home/scam-2024}, URL = {http://doi.org/10.48550/arXiv.2407.00768}, year = {2024} } @inproceedings{Silva2032293, author = {Silva, Andre and Monperrus, Martin}, booktitle = {2025 IEEE/Acm International Workshop On Large Language Models For Code, LLM4Code : }, institution = {KTH, Theoretical Computer Science, TCS}, note = {QC 20260126}, pages = {9--16}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {RepairBench : Leaderboard of Frontier Models for Program Repair}, DOI = {10.1109/LLM4Code66737.2025.00006}, keywords = {leaderboard, benchmark, program repair, large language models}, abstract = {AI-driven program repair uses AI models to repair buggy software by producing patches. Rapid advancements in frontier models surely impact performance on the program repair task. 
Yet, there is a lack of frequent and standardized evaluations to actually understand the strengths and weaknesses of models. To that end, we propose RepairBench, a novel leaderboard for AI-driven program repair. The key characteristics of RepairBench are: 1) it is execution-based: all patches are compiled and executed against a test suite, 2) it assesses frontier models in a frequent and standardized way. RepairBench leverages two high-quality benchmarks, Defects4J and GitBug-Java, to evaluate frontier models only against real-world program repair tasks. At the time of writing, RepairBench shows that claude-3-5-sonnet-20241022 is the best model for program repair, and deepseek-v3 one of the cheapest while ranking third. We publicly release the evaluation framework of RepairBench as well as all patches generated in the course of the evaluation. }, ISBN = {979-8-3315-2616-0}, ISBN = {979-8-3315-2615-3}, year = {2025} } @article{Silva1990730, author = {Silva, Andre and Fang, Sen and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {NC State University, USA}, journal = {IEEE Transactions on Software Engineering}, note = {QC 20250821}, number = {8}, pages = {2366--2380}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {RepairLLaMA : Efficient Representations and Fine-Tuned Adapters for Program Repair}, volume = {51}, DOI = {10.1109/TSE.2025.3581062}, keywords = {Automated Program Repair, Code Representations, Large Language Models, Parameter-Efficient Fine-Tuning}, abstract = {Automated Program Repair (APR) has evolved significantly with the advent of Large Language Models (LLMs). Fine-tuning LLMs for program repair is a recent avenue of research, with many dimensions which have not been explored. Existing work mostly fine-tune LLMs with naive code representations and does not scale to frontier models. 
To address this problem, we propose RepairLLaMA, a novel program repair approach that 1) identifies optimal code representations for APR with fine-tuned models, and 2) pioneers state-of-the-art parameter-efficient fine-tuning technique (PEFT) for program repair. This results in RepairLLaMA producing a highly effective ‘program repair adapter’ for fixing bugs with AI. Our experiments demonstrate the validity of both concepts. First, fine-tuning adapters with program repair specific code representations enables the model to use meaningful repair signals and produce better patches. Second, parameter-efficient fine-tuning helps fine-tuning to converge and clearly contributes to the effectiveness of RepairLLaMA in fixing bugs outside the fine-tuning data distribution. Overall, RepairLLaMA correctly fixes 144 Defects4J v2, 109 HumanEval-Java, and 20 GitBug-Java bugs, outperforming all baselines. }, year = {2025} } @inproceedings{Tiwari1784772, author = {Tiwari, Deepika and Monperrus, Martin and Baudry, Benoit}, booktitle = {2023 IEEE CONFERENCE ON SOFTWARE TESTING, VERIFICATION AND VALIDATION, ICST : }, institution = {KTH, Software and Computer systems, SCS}, institution = {KTH, Theoretical Computer Science, TCS}, note = {QC 20231127}, pages = {464--466}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {RICK : Generating Mocks from Production Data}, series = {IEEE International Conference on Software Testing Verification and Validation Workshops}, DOI = {10.1109/ICST57152.2023.00051}, keywords = {mocks, stubs, production, oracles, testing tool}, abstract = {Test doubles, such as mocks and stubs, are nifty fixtures in unit tests. They allow developers to test individual components in isolation from others that lie within or outside of the system. However, implementing test doubles within tests is not straightforward. 
With this demonstration, we introduce RICK, a tool that observes executing applications in order to automatically generate tests with realistic mocks and stubs. RICK monitors the invocation of target methods and their interactions with external components. Based on the data collected from these observations, RICK produces unit tests with mocks, stubs, and mock-based oracles. We highlight the capabilities of RICK, and how it can be used with real-world Java applications, to generate tests with mocks. }, year = {2023} } @inproceedings{CabreraArteaga1365299, author = {Cabrera Arteaga, Javier and Monperrus, Martin and Baudry, Benoit}, booktitle = {Proceedings of the 11th ACM SIGPLAN International Workshop on Virtual Machines and Intermediate Languages, VMIL@SPLASH : }, institution = {KTH, Theoretical Computer Science, TCS}, institution = {KTH, Software and Computer systems, SCS}, note = {Part of proceedings ISBN 978-1-4503-6987-9QC 20191028}, pages = {22--31}, eid = {3361228}, title = {Scalable comparison of JavaScript V8 bytecode traces}, DOI = {10.1145/3358504.3361228}, abstract = {The comparison and alignment of runtime traces are essential, e.g., for semantic analysis or debugging. However, naive sequence alignment algorithms cannot address the needs of the modern web: (i) the bytecode generation process of V8 is not deterministic; (ii) bytecode traces are large. We present STRAC, a scalable and extensible tool tailored to compare bytecode traces generated by the V8 JavaScript engine. Given two V8 bytecode traces and a distance function between trace events, STRAC computes and provides the best alignment. The key insight is to split access between memory and disk. STRAC can identify semantically equivalent web pages and is capable of processing huge V8 bytecode traces whose order of magnitude matches today's web like https://2019.splashcon.org, which generates approx. 150k of V8 bytecode instructions. 
}, year = {2019} } @misc{Baudry1590883, author = {Baudry, Benoit and Monperrus, Martin}, institution = {KTH, Software and Computer systems, SCS}, institution = {KTH, Theoretical Computer Science, TCS}, note = {QC 20210906}, title = {Science-changing Code}, URL = {https://cacm.acm.org/blogs/blog-cacm/252759-science-changing-code}, year = {2021} } @article{Uchitel1918205, author = {Uchitel, Sebastian and Monperrus, Martin and Zhong, Hao}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Universidad de Buenos Aires, Argentina}, institution = {Shanghai Jiao Tong University, Department of Computer Science and Engineering, China}, journal = {IEEE Transactions on Software Engineering}, note = {QC 20250120}, number = {11}, pages = {2709--2711}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {Scoping Software Engineering for AI : The TSE Perspective}, volume = {50}, DOI = {10.1109/TSE.2024.3470368}, year = {2024} } @inproceedings{Ye1735153, author = {Ye, He and Martinez, Matias and Luo, Xiapu and Zhang, Tao and Monperrus, Martin}, booktitle = {Proceedings of the 37th IEEE/ACM International Conference on Automated Software Engineering : }, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Université Polytechnique Hauts-de-France, France}, note = {QC 20230214}, publisher = {Association for Computing Machinery (ACM)}, title = {SelfAPR : Self-Supervised Program Repair with Test Execution Diagnostics}, series = {ASE ’22}, DOI = {10.1145/3551349.3556926}, abstract = {Learning-based program repair has achieved good results in a recent series of papers. Yet, we observe that the related work fails to repair some bugs because of a lack of knowledge about 1) the application domain of the program being repaired, and 2) the fault type being repaired. In this paper, we solve both problems by changing the learning paradigm from supervised training to self-supervised training in an approach called SelfAPR. 
First, SelfAPR generates training samples on disk by perturbing a previous version of the program being repaired, enforcing the neural model to capture project-specific knowledge. This is different from the previous work based on mined past commits. Second, SelfAPR executes all training samples and extracts and encodes test execution diagnostics into the input representation, steering the neural model to fix the kind of fault. This is different from the existing studies that only consider static source code as input. We implement SelfAPR and evaluate it in a systematic manner. We generate 1 039 873 training samples obtained by perturbing 17 open-source projects. We evaluate SelfAPR on 818 bugs from Defects4J, SelfAPR correctly repairs 110 of them, outperforming all the supervised learning repair approaches. }, URL = {https://doi.org/10.1145/3551349.3556926}, year = {2023} } @article{Kommrusch1791114, author = {Kommrusch, Steve and Monperrus, Martin and Pouchet, Louis-Noel}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Colorado State Univ, Ft Collins, CO 80523 USA.}, institution = {Colorado State Univ, Ft Collins, CO 80523 USA.}, journal = {IEEE Transactions on Software Engineering}, note = {QC 20231127}, number = {7}, pages = {3771--3792}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {Self-Supervised Learning to Prove Equivalence Between Straight-Line Programs via Rewrite Rules}, volume = {49}, DOI = {10.1109/TSE.2023.3271065}, keywords = {Machine learning, program equivalence, self-supervised learning, symbolic reasoning}, abstract = {We target the problem of automatically synthesizing proofs of semantic equivalence between two programs made of sequences of statements. We represent programs using abstract syntax trees (AST), where a given set of semantics-preserving rewrite rules can be applied on a specific AST pattern to generate a transformed and semantically equivalent program. 
In our system, two programs are equivalent if there exists a sequence of application of these rewrite rules that leads to rewriting one program into the other. We propose a neural network architecture based on a transformer model to generate proofs of equivalence between program pairs. The system outputs a sequence of rewrites, and the validity of the sequence is simply checked by verifying it can be applied. If no valid sequence is produced by the neural network, the system reports the programs as non-equivalent, ensuring by design no programs may be incorrectly reported as equivalent. Our system is fully implemented for one single grammar which can represent straight-line programs with function calls and multiple types. To efficiently train the system to generate such sequences, we develop an original incremental training technique, named self-supervised sample selection. We extensively study the effectiveness of this novel training approach on proofs of increasing complexity and length. Our system,S4Eq, achieves 97% proof success on a curated dataset of 10,000 pairs of equivalent programs. }, year = {2023} } @article{Wachter2042112, author = {Wachter, Julian and Tiwari, Deepika and Monperrus, Martin and Baudry, Benoit}, institution = {KTH, Theoretical Computer Science}, institution = {Karlsruhe Inst Technol, Karlsruhe, Germany}, journal = {Journal of Systems and Software}, note = {QC 20260226}, eid = {112721}, publisher = {Elsevier BV}, title = {Serializing java objects in plain code}, volume = {234}, DOI = {10.1016/j.jss.2025.112721}, keywords = {Code, Serialization, Objects on disk, Runtime, Java}, abstract = {In managed languages, serialization of objects is typically done in bespoke binary formats such as Protobuf, or markup languages such as XML or JSON. The major limitation of these formats is readability. Human developers cannot read binary code, and in most cases, suffer from the syntax of XML or JSON. 
This is a major issue when objects are meant to be embedded and read in source code, such as in test cases. To address this problem, we propose plain-code serialization. Our core idea is to serialize objects observed at runtime in the native syntax of a programming language. We realize this vision in the context of Java, and demonstrate a prototype which serializes Java objects to Java source code. The resulting source faithfully reconstructs the objects seen at runtime. Our prototype is called PRODJ and is publicly available. We experiment with PRODJ to successfully plain-code serialize 174,699 objects observed during the execution of 4 open-source Java applications. Our performance measurement shows that the performance impact is not noticeable. Through a user study, we demonstrate that developers prefer plain-code serialized objects within automatically generated tests over their representations as XML or JSON. }, year = {2026} } @unpublished{Eshghie1956137, author = {Eshghie, Mojtaba and Andersson Kasche, Gustav and Artho, Cyrille and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, note = {QC 20250505}, title = {SInDi: Semantic Invariant Differencing For Solidity Smart Contracts}, keywords = {Invariants, Symbolic Analysis, Semantic Equivalence, Smart Contracts, Pre-/Post-Conditions}, abstract = {Advancements in invariants-based smart contract analysis and verification call for tools that reliably and efficiently check semantic difference between invariants. These invariants, logical expressions that should hold in blockchain transactions, are enforced through require/assert statements in Solidity smart contracts. We present SInDi, a semantic invariant differencing tool for Solidity contracts that symbolically checks the differences of any two given invariants and quickly generates a verdict. 
Our evaluation on real-world smart contracts demonstrates SInDi's accuracy of 100% and efficiency of 0.09 seconds on average per verdict compared to human verdicts. Furthermore, we develop an invariant denoising pipeline based on SInDi that effectively removes up to 41.8% of weak dynamically-mined invariants to facilitate further analysis and verification tasks based on these auto-generated invariants. }, } @article{Bobadilla1849847, author = {Bobadilla, Sofia and Glassey, Richard and Bergel, Alexandre and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {RelationalAI, Bern, Switzerland.}, journal = {IEEE Software}, note = {QC 20240701}, number = {2}, pages = {68--76}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {SOBO : A Feedback Bot to Nudge Code Quality in Programming Courses}, volume = {41}, DOI = {10.1109/MS.2023.3298729}, keywords = {Codes, Chatbots, Education, Task analysis, Software development management, Programming profession, Software engineering}, abstract = {This paper presents SOBO, a bot we designed to automatically provide feedback on code quality to undergraduate students. SOBO has been deployed in a course at the KTH Royal Institute of Technology in Sweden with more than 130 students. 
}, year = {2024} } @inproceedings{Bobadilla1990017, author = {Bobadilla, Sofia and Glassey, Richard and Bergel, Alexandre and Monperrus, Martin}, booktitle = {Proceedings - 2025 IEEE/ACM 37th International Conference on Software Engineering Education and Training, CSEE and T 2025 : }, institution = {KTH, Theoretical Computer Science, TCS}, institution = {RelationalAI, Bern, Switzerland}, note = {Part of ISBN 9798331537098. QC 20250819}, pages = {229--}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {SOBO: A Feedback Bot to Nudge Code Quality in Programming Courses}, DOI = {10.1109/CSEET66350.2025.00029}, keywords = {bots, computer science, education, software engineering}, abstract = {Recent research has shown the great potential of automatic feedback in education. This paper presents SOBO, a bot we designed to automatically provide feedback on code quality to undergraduate students. SOBO has been deployed in a course at the KTH Royal Institute of Technology in Sweden with 130+ students. Overall, SOBO has analyzed 1687 GitHub repositories and produced 8443 tailored code quality feedback messages to students. Unlike traditional tools embedded in CI pipelines, SOBO is designed to interact with students in a way that promotes personalized learning without imposing additional teaching burdens. The quantitative and qualitative results indicate that SOBO effectively nudges students into adopting code quality best practices, without interfering with pedagogical objectives. From this experience, we provide guidelines on how to design and deploy teaching bots in programming courses. 
}, year = {2025} } @inproceedings{Balliu1823042, author = {Balliu, Musard and Baudry, Benoit and Bobadilla, Sofia and Ekstedt, Mathias and Monperrus, Martin and Ron Arteaga, Javier and Sharma, Aman and Skoglund, Gabriel and Soto Valero, C{\'e}sar and Wittlinger, Martin}, booktitle = {SCORED 2023 - Proceedings of the 2023 Workshop on Software Supply Chain Offensive Research and Ecosystem Defenses : }, institution = {KTH, Theoretical Computer Science, TCS}, institution = {KTH, Software and Computer systems, SCS}, institution = {KTH, Network and Systems Engineering}, note = {Part of proceedings ISBN 9798400702631. QC 20231229}, pages = {75--76}, publisher = {Association for Computing Machinery (ACM)}, title = {Software Bill of Materials in Java}, DOI = {10.1145/3605770.3625207}, keywords = {sbom, software supply chain}, abstract = {Modern software applications are virtually never built entirely in-house. As a matter of fact, they reuse many third-party dependencies, which form the core of their software supply chain [1]. The large number of dependencies in an application has turned into a major challenge for both security and reliability. For example, to compromise a high-value application, malicious actors can choose to attack a less well-guarded dependency of the project [2]. Even when there is no malicious intent, bugs can propagate through the software supply chain and cause breakages in applications. Gathering accurate, up-to-date information about all dependencies included in an application is, therefore, of vital importance. 
}, year = {2023} } @article{Etemadi1730207, author = {Etemadi, Khashayar and Harrand, Nicolas and Lars{\’e;}n, Simon and Adzemovic, Haris and Luong Phu, Henry and Verma, Ashutosh and Madeiral, Fernanda and Wikstr{\"o}m, Douglas and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {KTH, Software and Computer systems, SCS}, institution = {Compute Science, IIT Bombay, 29491 Mumbai, Maharashtra, India}, journal = {IEEE Transactions on Dependable and Secure Computing}, note = {QC 20250513}, number = {4}, pages = {2794--2810}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {Sorald : Automatic Patch Suggestions for SonarQube Static Analysis Violations}, volume = {20}, DOI = {10.1109/TDSC.2022.3167316}, keywords = {automatic program repair, Codes, Computer bugs, Java, Maintenance engineering, metaprogramming, Software development management, Static analysis, Static code analysis, Syntactics, Codes (symbols), Computer software, Java programming language, Program debugging, Repair, Software design, Trees (mathematics), Automatic programs, Code, Meta Programming, Static analyzers, Static codes}, abstract = {Previous work has shown that early resolution of issues detected by static code analyzers can prevent major costs later on. However, developers often ignore such issues for two main reasons. First, many issues should be interpreted to determine if they correspond to actual flaws in the program. Second, static analyzers often do not present the issues in a way that is actionable. To address these problems, we present Sorald: a novel system that uses metaprogramming templates to transform the abstract syntax trees of programs and suggests fixes for static analysis warnings. Thus, the burden on the developer is reduced from interpreting and fixing static issues, to inspecting and approving full fledged solutions. 
Sorald fixes violations of 10 rules from SonarJava, one of the most widely used static analyzers for Java. We evaluate Sorald on a dataset of 161 popular repositories on Github. Our analysis shows the effectiveness of Sorald as it fixes 65% (852/1,307) of the violations that meets the repair preconditions. Overall, our experiments show it is possible to automatically fix notable violations of the static analysis rules produced by the state-of-the-art static analyzer SonarJava. }, year = {2023} } @inproceedings{White1323003, author = {White, M. and Tufano, M. and Martinez, M. and Monperrus, Martin and Poshyvanyk, D.}, booktitle = {SANER 2019 - Proceedings of the 2019 IEEE 26th International Conference on Software Analysis, Evolution, and Reengineering : }, institution = {KTH, Theoretical Computer Science, TCS}, note = {QC 20190611Part of ISBN 9781728105918}, pages = {479--490}, eid = {8668043}, title = {Sorting and Transforming Program Repair Ingredients via Deep Learning Code Similarities}, DOI = {10.1109/SANER.2019.8668043}, keywords = {code clones, deep learning, language models, neural networks, program repair, software testing and debugging, Codes (symbols), Open source software, Program debugging, Redundancy, Reengineering, Repair, Software testing, Code clone, Code similarities, Comparative experiments, Language model, Learning-based approach, Repair techniques, Search strategies}, abstract = {In the field of automated program repair, the redundancy assumption claims large programs contain the seeds of their own repair. However, most redundancy-based program repair techniques do not reason about the repair ingredients-The code that is reused to craft a patch. We aim to reason about the repair ingredients by using code similarities to prioritize and transform statements in a codebase for patch generation. Our approach, DeepRepair, relies on deep learning to reason about code similarities. 
Code fragments at well-defined levels of granularity in a codebase can be sorted according to their similarity to suspicious elements (i.e., code elements that contain suspicious statements) and statements can be transformed by mapping out-of-scope identifiers to similar identifiers in scope. We examined these new search strategies for patch generation with respect to effectiveness from the viewpoint of a software maintainer. Our comparative experiments were executed on six open-source Java projects including 374 buggy program revisions and consisted of 19,949 trials spanning 2,616 days of computation time. DeepRepair's search strategy using code similarities generally found compilable ingredients faster than the baseline, jGenProg, but this improvement neither yielded test-adequate patches in fewer attempts (on average) nor found significantly more patches (on average) than the baseline. Although the patch counts were not statistically different, there were notable differences between the nature of DeepRepair patches and jGenProg patches. The results show that our learning-based approach finds patches that cannot be found by existing redundancy-based repair techniques. }, year = {2019} } @article{Larsen1701299, author = {Lars{\'e}n, Simon and Falleri, Jean-R{\'e}my 
and Baudry, Benoit and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {KTH, Software and Computer systems, SCS}, institution = {LaBRI, UMR 5800, Universit de Bordeaux, Talence, Gironde, France, 33405}, journal = {IEEE Transactions on Software Engineering}, note = {QC 20250513}, number = {1}, pages = {64--83}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {Spork : Structured Merge for Java with Formatting Preservation}, volume = {49}, DOI = {10.1109/TSE.2022.3143766}, keywords = {Codes, Computer languages, Java, Merging, Semantics, structured merge, Syntactics, Time complexity, Version control, Abstracting, Codes (symbols), Java programming language, Open source software, Software design, Trees (mathematics), Code, Highly parallels, Source codes, State of the art, State of the practice, Work-flows}, abstract = {The highly parallel workflows of modern software development have made merging of source code a common activity for developers. The state of the practice is based on line-based merge, which is ubiquitously used with git merge. Line-based merge is however a generalized technique for any text that cannot leverage the structured nature of source code, making merge conflicts a common occurrence. As a remedy, research has proposed structured merge tool, which typically operate on abstract syntax trees instead of raw text. Structured merging greatly reduces the prevalence of merge conflicts but suffers from important limitations, the main ones being a tendency to alter the formatting of the merged code and being prone to excessive running times. In this paper, we present SPORK, a novel structured merge tool for JAVA. SPORK is unique as it preserves formatting to a significantly greater degree than comparable state-of-the-art tools. SPORK is also overall faster than the state of the art, in particular significantly reducing worst-case performance in practice. 
We demonstrate these properties by replaying 1740 real-world file merges collected from 119 open-source projects, and further demonstrate several key differences between SPORK and the state of the art with in-depth case studies. }, year = {2023} } @article{Loriot1692859, author = {Loriot, Benjamin and Madeiral Delfim, Fernanda and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Univ Technol Compiegne, Compiegne, France.}, journal = {Empirical Software Engineering}, note = {QC 20221101}, number = {6}, eid = {149}, publisher = {Springer Nature}, title = {Styler : learning formatting conventions to repair Checkstyle violations}, volume = {27}, DOI = {10.1007/s10664-021-10107-0}, keywords = {Coding conventions, Linter, Format checker, Checkstyle, Formatting violations, Automatic repair}, abstract = {Ensuring the consistent usage of formatting conventions is an important aspect of modern software quality assurance. To do so, the source code of a project should be checked against the formatting conventions (or rules) adopted by its development team, and then the detected violations should be repaired if any. While the former task can be automatically done by format checkers implemented in linters, there is no satisfactory solution for the latter. Manually fixing formatting convention violations is a waste of developer time and code formatters do not take into account the conventions adopted and configured by developers for the used linter. In this paper, we present Styler, a tool dedicated to fixing formatting rule violations raised by format checkers using a machine learning approach. For a given project, Styler first generates training data by injecting violations of the project-specific rules in violation-free source code files. Then, it learns fixes by feeding long short-term memory neural networks with the training data encoded into token sequences. 
Finally, it predicts fixes for real formatting violations with the trained models. Currently, Styler supports a single checker, Checkstyle, which is a highly configurable and popular format checker for Java. In an empirical evaluation, Styler repaired 41% of 26,791 Checkstyle violations mined from 104 GitHub projects. Moreover, we compared Styler with the IntelliJ plugin CheckStyle-IDEA and the machine-learning-based code formatters Naturalize and CodeBuff. We found out that Styler fixes violations of a diverse set of Checkstyle rules (24/25 rules), generates smaller repairs in comparison to the other systems, and predicts repairs in seconds once trained on a project. Through a manual analysis, we identified cases in which Styler does not succeed to generate correct repairs, which can guide further improvements in Styler. Finally, the results suggest that Styler can be useful to help developers repair Checkstyle formatting violations. }, year = {2022} } @inproceedings{CabreraArteaga1463635, author = {Cabrera Arteaga, Javier and Donde, Shrinish and Gu, Jian and Floros, Orestis and Satabin, Lucas and Baudry, Benoit and Monperrus, Martin}, booktitle = {Conference Companion of the 4th International Conference on Art, Science, and Engineering of Programming : }, institution = {KTH, Software and Computer systems, SCS}, institution = {KTH, Theoretical Computer Science, TCS}, note = {QC 20201123}, title = {Superoptimization of WebAssembly bytecode}, DOI = {10.1145/3397537.3397567}, keywords = {WebAssembly, Web, Superoptimization, Optimization, LLVM}, abstract = {Motivated by the fast adoption of WebAssembly, we propose the first functional pipeline to support the superoptimization of WebAssembly bytecode. Our pipeline works over LLVM and Souper. We evaluate our superoptimization pipeline with 12 programs from the Rosetta code project. Our pipeline improves the code section size of 8 out of 12 programs. 
We discuss the challenges faced in superoptimization of WebAssembly with two case studies. }, URL = {https://dl.acm.org/doi/10.1145/3397537.3397567}, year = {2020} } @unpublished{Chen1811474, author = {Chen, Zimin and Fang, Sen and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, note = {QC 20231120}, title = {Supersonic: Learning to Generate Source Code Optimizations in C/C++}, abstract = {Software optimization refines programs for resource efficiency while preserving functionality. Traditionally, it is a process done by developers and compilers. This paper introduces a third option, automated optimization at the source code level. We present SUPERSONIC, a neural approach targeting minor source code modifications for optimization. Using a seq2seq model, SUPERSONIC is trained on C/C++ program pairs (xt, xt+1), where xt+1 is an optimized version of xt, and outputs a diff. SUPERSONIC's performance is benchmarked against OpenAI's GPT-3.5-Turbo and GPT-4 on competitive programming tasks. The experiments show that SUPERSONIC not only outperforms both models on the code optimization task but also minimizes the extent of the change with a model more than 600x smaller than GPT-3.5-Turbo and 3700x smaller than GPT-4. }, year = {2023} } @article{Chen1930798, author = {Chen, Zimin and Fang, Sen and Monperrus, Martin}, institution = {KTH, Theoretical Computer Science, TCS}, journal = {IEEE Transactions on Software Engineering}, note = {QC 20250124}, number = {11}, pages = {2849--2864}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {Supersonic : Learning to Generate Source Code Optimizations in C/C++}, volume = {50}, DOI = {10.1109/TSE.2024.3423769}, keywords = {Optimization, Codes, Training, Source coding, Task analysis, Decoding, Vectors, Code optimization, Seq2Seq learning, large language model}, abstract = {Software optimization refines programs for resource efficiency while preserving functionality. 
Traditionally, it is a process done by developers and compilers. This paper introduces a third option, automated optimization at the source code level. We present Supersonic , a neural approach targeting minor source code modifications for optimization. Using a seq2seq model, Supersonic is trained on C/C++ program pairs ( x(t) , x(t+1) ), where x(t+1) is an optimized version of x(t) , and outputs a diff. Supersonic 's performance is benchmarked against OpenAI's GPT-3.5-Turbo and GPT-4 on competitive programming tasks. The experiments show that Supersonic not only outperforms both models on the code optimization task but also minimizes the extent of the change with a model more than 600x smaller than GPT-3.5-Turbo and 3700x smaller than GPT-4. }, year = {2024} } @article{Baudry1639161, author = {Baudry, Benoit and Monperrus, Martin}, institution = {KTH, Software and Computer systems, SCS}, institution = {KTH, Theoretical Computer Science, TCS}, journal = {Increment}, note = {QC 20220228}, number = {16}, publisher = {Stripe}, title = {Testing beyond coverage}, volume = {Feb}, URL = {https://increment.com/reliability/testing-beyond-coverage/}, year = {2021} } @article{Gamage2039975, author = {Gamage, Yogya and Tiwari, Deepika and Monperrus, Martin and Baudry, Benoit}, institution = {KTH, Software and Computer systems, SCS}, institution = {KTH, Theoretical Computer Science}, institution = {Université de Montréal, Montreal, Canada}, institution = {Université de Montréal, Montreal, Canada}, journal = {Empirical Software Engineering}, note = {QC 20260219}, number = {3}, eid = {63}, publisher = {Springer Nature}, title = {The design space of lockfiles across package managers}, volume = {31}, DOI = {10.1007/s10664-025-10789-w}, abstract = {Software developers reuse third-party packages that are hosted in package registries. At build time, a package manager resolves and fetches the direct and indirect dependencies of a project. 
Most package managers also generate a lockfile, which records the exact set of resolved dependency versions. Lockfiles are used to reduce build times; to verify the integrity of resolved packages; and to support build reproducibility across environments and time. Despite these beneficial features, developers often struggle with their maintenance, usage, and interpretation. In this study, we unveil the major challenges related to lockfiles, such that future researchers and engineers can address them. We perform the first comprehensive study of lockfiles across seven popular package managers, npm, pnpm, Cargo, Poetry, Pipenv, Gradle, and Go. First, we highlight the wide variety of design decisions that package managers make, regarding the generation process as well as the content of lockfiles. Next, we conduct a qualitative analysis based on semi-structured interviews with 15 developers. We capture first-hand insights about the benefits that developers perceive in lockfiles, as well as the challenges they face to manage these files. Following these observations, we make five recommendations to further improve lockfiles, for a better developer experience. }, year = {2026} } @article{SotoValero1707260, author = {Soto Valero, C{\’e;}sar and Monperrus, Martin and Baudry, Benoit}, institution = {KTH, Software and Computer systems, SCS}, institution = {KTH, Theoretical Computer Science, TCS}, journal = {Computer}, note = {QC 20221031}, number = {10}, pages = {26--34}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {The Multibillion Dollar Software Supply Chain of Ethereum}, volume = {55}, DOI = {10.1109/MC.2022.3175542}, abstract = {Ethereum is the single largest programmable blockchain platform today. Ethereum nodes operate the blockchain, relying on a vast supply chain of third-party software dependencies. 
In this article, we perform an analysis of the software supply chain of Java Ethereum nodes and distill the challenges of maintaining and securing this blockchain technology. }, year = {2022} } @article{Fernandez1348181, author = {Fernandez, Daniel Mendez and Monperrus, Martin and Feldt, Robert and Zimmermann, Thomas}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Tech Univ Munich, Munich, Germany.}, institution = {Chalmers Univ Technol, Gothenburg, Sweden.;Blekinge Inst Technol, Karlskrona, Sweden.}, institution = {Microsoft Res, Redmond, WA 98052 USA.}, journal = {Empirical Software Engineering}, note = {QC 20190903}, number = {3}, pages = {1057--1060}, publisher = {SPRINGER}, title = {The open science initiative of the Empirical Software Engineering journal}, volume = {24}, DOI = {10.1007/s10664-019-09712-x}, year = {2019} } @article{Mendez1760175, author = {Mendez, Daniel and Monperrus, Martin and Feldt, Robert and Zimmermann, Thomas}, institution = {Blekinge Institute of Technology, Department of Software Engineering}, institution = {Technical University of Munich, Germany}, institution = {KTH Royal Institute of Technology}, institution = {Microsoft Research, United States}, journal = {Empirical Software Engineering}, number = {3}, pages = {1057--1060}, title = {The open science initiative of the Empirical Software Engineering journal}, volume = {24}, DOI = {10.1007/s10664-019-09712-x}, year = {2019} } @inproceedings{Harrand1430144, author = {Harrand, Nicolas and Soto Valero, C{\'e}sar and Monperrus, Martin and Baudry, Benoit}, booktitle = {Proceedings - 19th IEEE International Working Conference on Source Code Analysis and Manipulation, SCAM 2019 : }, institution = {KTH, Software and Computer systems, SCS}, institution = {KTH, Theoretical Computer Science, TCS}, note = {QC 20200513Part of ISBN 9781728149370}, pages = {92--102}, publisher = {Institute of Electrical and Electronics Engineers Inc.}, title = {The strengths and behavioral 
quirks of Java bytecode decompilers}, DOI = {10.1109/SCAM.2019.00019}, keywords = {Decompilation, Java bytecode, Reverse engineering, Source code analysis, Equivalence classes, Java programming language, Open systems, Semantics, Syntactics, Java byte codes, Java source codes, Open source software projects, Quality indicators, Real world projects, Semantic equivalences, Open source software}, abstract = {During compilation from Java source code to bytecode, some information is irreversibly lost. In other words, compilation and decompilation of Java code is not symmetric. Consequently, the decompilation process, which aims at producing source code from bytecode, must establish some strategies to reconstruct the information that has been lost. Modern Java decompilers tend to use distinct strategies to achieve proper decompilation. In this work, we hypothesize that the diverse ways in which bytecode can be decompiled has a direct impact on the quality of the source code produced by decompilers. We study the effectiveness of eight Java decompilers with respect to three quality indicators: syntactic correctness, syntactic distortion and semantic equivalence modulo inputs. This study relies on a benchmark set of 14 real-world open-source software projects to be decompiled (2041 classes in total). Our results show that no single modern decompiler is able to correctly handle the variety of bytecode structures coming from real-world programs. Even the highest ranking decompiler in this study produces syntactically correct output for 84% of classes of our dataset and semantically equivalent code output for 78% of classes. 
}, year = {2019} } @inproceedings{Zhang1446670, author = {Zhang, Long and Monperrus, Martin}, booktitle = {Proceedings - International Symposium on Software Reliability Engineering, ISSRE : }, institution = {KTH, Theoretical Computer Science, TCS}, note = {QC 20200624Part of ISBN 9781728149813}, pages = {116--127}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {TripleAgent : Monitoring, Perturbation and Failure-Obliviousness for Automated Resilience Improvement in Java Applications}, DOI = {10.1109/ISSRE.2019.00021}, keywords = {Dynamic analysis, Exception handling, Fault injection, Software resilience, Automation, Software reliability, Automated monitoring, Design and implements, E-mail servers, Java applications, Java virtual machines, Unique features, Java programming language}, abstract = {In this paper, we present a novel resilience improvement system for Java applications. The unique feature of this system is to combine automated monitoring, automated perturbation injection, and automated resilience improvement. The latter is achieved thanks to the failure-oblivious computing, a concept introduced in 2004 by Rinard and colleagues. We design and implement the system as agents for the Java virtual machine. We evaluate the system on two real-world applications: a file transfer client and an email server. Our results show that it is possible to automatically improve the resilience of Java applications with respect to uncaught or mishandled exceptions. }, year = {2019} } @inproceedings{Martinez1261662, author = {Martinez, M. 
and Monperrus, Martin}, booktitle = {10th International Symposium on Search-Based Software Engineering, SSBSE 2018 : }, institution = {KTH, Theoretical Computer Science, TCS}, note = {QC 20181108}, pages = {65--86}, title = {Ultra-large repair search space with automatically mined templates : The cardumen mode of astor}, series = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)}, number = {11036}, volume = {11036}, DOI = {10.1007/978-3-319-99241-9_3}, keywords = {Automated program repair, Code templates, Patch dataset, Test-suite based repair approaches}, abstract = {Astor is a program repair library which has different modes. In this paper, we present the Cardumen mode of Astor, a repair approach based on mined templates that has an ultra-large search space. We evaluate the capacity of Cardumen to discover test-suite adequate patches (aka plausible patches) over the 356 real bugs from Defects4J [11]. Cardumen finds 8935 patches over 77 bugs of Defects4J. This is the largest number of automatically synthesized patches ever reported, all patches being available in an open-science repository. Moreover, Cardumen identifies 8 unique patches, that are patches for Defects4J bugs that were never repaired in the whole history of program repair. 
}, ISBN = {9783319992402}, year = {2018} } @article{Oliveira1935311, author = {Oliveira, Delano and Santos, Reydne and de Oliveira, Benedito and Monperrus, Martin and Castor, Fernando and Madeiral, Fernanda}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {Univ Fed Pernambuco, BR-50732970 Recife, Brazil.;Fed Inst Pernambuco, BR-55540000 Palmares, Brazil.}, institution = {Univ Fed Pernambuco, BR-50732970 Recife, Brazil.}, institution = {Univ Fed Pernambuco, BR-50732970 Recife, Brazil.}, institution = {Univ Fed Pernambuco, BR-50732970 Recife, Brazil.;Univ Twente, NL-7522 NB Enschede, Netherlands.}, institution = {Vrije Univ Amsterdam, NL-1081 HV Amsterdam, Netherlands.}, journal = {IEEE Transactions on Software Engineering}, note = {QC 20250206}, number = {1}, pages = {14--37}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, title = {Understanding Code Understandability Improvements in Code Reviews}, volume = {51}, DOI = {10.1109/TSE.2024.3453783}, keywords = {Codes, Reviews, Source coding, Software development management, Documentation, Security, Natural languages, Code understandability, code understandability smells, code review}, abstract = {Context: Code understandability plays a crucial role in software development, as developers spend between 58% and 70% of their time reading source code. Improving code understandability can lead to enhanced productivity and save maintenance costs. Problem: Experimental studies aim to establish what makes code more or less understandable in a controlled setting, but ignore that what makes code easier to understand in the real world also depends on extraneous elements such as developers' background and project culture and guidelines. Not accounting for the influence of these factors may lead to results that are sound but have little external validity. Goal: We aim to investigate how developers improve code understandability during software development through code review comments. 
Our assumption is that code reviewers are specialists in code quality within a project. Method and Results: We manually analyzed 2,401 code review comments from Java open-source projects on GitHub and found that over 42% of all comments focus on improving code understandability, demonstrating the significance of this quality attribute in code reviews. We further explored a subset of 385 comments related to code understandability and identified eight categories of code understandability concerns, such as incomplete or inadequate code documentation, bad identifier, and unnecessary code. Among the suggestions to improve code understandability, 83.9% were accepted and integrated into the codebase. Among these, only two (less than 1%) ended up being reverted later. We also identified types of patches that improve code understandability, ranging from simple changes (e.g., removing unused code) to more context-dependent improvements (e.g., replacing method calling chains by existing API). Finally, we investigated the potential coverage of four well-known linters to flag the identified code understandability issues. These linters cover less than 30% of these issues, although some of them could be easily added as new rules. Implications: Our findings motivate and provide practical insight for the construction of tools to make code more understandable, e.g., understandability improvements are rarely reverted and thus can be used as reliable training data for specialized ML-based tools. This is also supported by our dataset, which can be used to train such models. Finally, our findings can also serve as a basis to develop evidence-based code style guides. 
}, year = {2025} } @inproceedings{Andersson1954037, author = {Andersson, Vivi and Baudry, Benoit and Bobadilla, Sofia and Christensen, Ludvig and Cofano, Serena and Etemadi, Khashayar and Liu, Raphina and Monperrus, Martin and Reyes García, Frank and Ron Arteaga, Javier and Sharma, Aman and Tiwari, Deepika and Toady, Tim}, booktitle = {SIGBOVIK : A Record of the Proceedings of SIGBOVIK 2025}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {KTH, Software and Computer systems, SCS}, institution = {KTH, School of Electrical Engineering and Computer Science (EECS)}, institution = {KTH}, note = {QC 20250905}, pages = {24--35}, publisher = {SIGBOVIK}, title = {UPPERCASE IS ALL YOU NEED}, abstract = {WE PRESENT THE FIRST COMPREHENSIVE STUDY ON THE CRITICAL YET OVERLOOKED ROLE OF UPPERCASE TEXT IN ARTIFICIAL INTELLIGENCE. DESPITE CONSTITUTING A MERE SINGLE-DIGIT PERCENTAGE OF STANDARD ENGLISH PROSE, UPPERCASE LETTERS HAVE DISPROPORTIONATE POWER IN HUMAN-AI INTERACTIONS. THROUGH RIGOROUS EXPERIMENTATION INVOLVING SHOUTING AT VARIOUS LANGUAGE MODELS, WE DEMONSTRATE THAT UPPERCASE IS NOT MERELY A STYLISTIC CHOICE BUT A FUNDAMENTAL TOOL FOR AI COMMUNICATION. OUR RESULTS REVEAL THAT UPPERCASE TEXT SIGNIFICANTLY ENHANCES COMMAND AUTHORITY, CODE GENERATION QUALITY, AND – MOST CRUCIALLY – THE AI’S ABILITY TO CREATE APPROPRIATE CAT PICTURES. THIS PAPER DEFINITIVELY PROVES THAT IN THE REALM OF HUMAN-AI INTERACTION, BIGGER LETTERS == BETTER RESULTS. OUR FINDINGS SUGGEST THAT THE CAPS-LOCK KEY MAY BE THE MOST UNDERUTILIZED RESOURCE IN MODERN AI. 
}, URL = {https://sigbovik.org/2025/proceedings.pdf}, year = {2025} } @article{Cabrera-Arteaga1832730, author = {Cabrera-Arteaga, Javier and Fitzgerald, Nicholas and Monperrus, Martin and Baudry, Benoit}, institution = {KTH, Software and Computer systems, SCS}, institution = {KTH, Theoretical Computer Science, TCS}, journal = {Computers & Security}, note = {QC 20240131}, pages = {103731--103731}, eid = {103731}, title = {Wasm-Mutate : Fast and effective binary diversification for WebAssembly}, volume = {139}, DOI = {10.1016/j.cose.2024.103731}, keywords = {WebAssembly, Software Diversification}, abstract = {WebAssembly is the fourth officially endorsed Web language. It is recognized because of its efficiency and design, focused on security. Yet, its swiftly expanding ecosystem lacks robust software diversification systems. We introduce Wasm-Mutate, a diversification engine specifically designed for WebAssembly. Our engine meets several essential criteria: 1) To quickly generate functionally identical, yet behaviorally diverse, WebAssembly variants, 2) To be universally applicable to any WebAssembly program, irrespective of the source programming language, and 3) Generated variants should counter side-channels. By leveraging an e-graph data structure, Wasm-Mutate is implemented to meet both speed and efficacy. We evaluate Wasm-Mutate by conducting experiments on 404 programs, which include real-world applications. Our results highlight that Wasm-Mutate can produce tens of thousands of unique and efficient WebAssembly variants within minutes. Significantly, Wasm-Mutate can safeguard WebAssembly binaries against timing side-channel attacks, especially those of the Spectre type. 
}, year = {2024} } @article{CabreraArteaga1781827, author = {Cabrera Arteaga, Javier and Monperrus, Martin and Toady, Tim and Baudry, Benoit}, institution = {KTH, Software and Computer systems, SCS}, institution = {KTH, Theoretical Computer Science, TCS}, institution = {KTH, School of Electrical Engineering and Computer Science (EECS)}, journal = {Computers & Security}, note = {QC 20230711}, eid = {103296}, publisher = {Elsevier BV}, title = {WebAssembly diversification for malware evasion}, volume = {131}, DOI = {10.1016/j.cose.2023.103296}, keywords = {Cryptojacking, Malware evasion, Software diversification, WebAssembly}, abstract = {WebAssembly has become a crucial part of the modern web, offering a faster alternative to JavaScript in browsers. While boosting rich applications in browser, this technology is also very efficient to develop cryptojacking malware. This has triggered the development of several methods to detect cryptojacking malware. However, these defenses have not considered the possibility of attackers using evasion techniques. This paper explores how automatic binary diversification can support the evasion of WebAssembly cryptojacking detectors. We experiment with a dataset of 33 WebAssembly cryptojacking binaries and evaluate our evasion technique against two malware detectors: VirusTotal, a general-purpose detector, and MINOS, a WebAssembly-specific detector. Our results demonstrate that our technique can automatically generate variants of WebAssembly cryptojacking that evade the detectors in 90% of cases for VirusTotal and 100% for MINOS. Our results emphasize the importance of meta-antiviruses and diverse detection techniques and provide new insights into which WebAssembly code transformations are best suited for malware evasion. We also show that the variants introduce limited performance overhead, making binary diversification an effective technique for evasion. 
}, year = {2023} } @inproceedings{Tiwari1884464, author = {Tiwari, Deepika and Toady, Tim and Monperrus, Martin and Baudry, Benoit}, booktitle = {Proceedings - 2024 ACM/IEEE 46th International Conference on Software Engineering: Software Engineering in Society, ICSE-SEIS 2024 : }, institution = {KTH, Software and Computer systems, SCS}, institution = {KTH}, institution = {KTH, Theoretical Computer Science, TCS}, note = {Part of ISBN 9798400704994QC 20240716}, pages = {1--11}, publisher = {Association for Computing Machinery (ACM)}, title = {With Great Humor Comes Great Developer Engagement}, DOI = {10.1145/3639475.3640099}, keywords = {Culture, Developer engagement, Faking, Humor, Responsibility}, abstract = {The worldwide collaborative effort for the creation of software is technically and socially demanding. The more engaged developers are, the more value they impart to the software they create. Engaged developers, such as Margaret Hamilton programming Apollo 11, can succeed in tackling the most difficult engineering tasks. In this paper, we dive deep into an original vector of engagement - humor - and study how it fuels developer engagement. First, we collect qualitative and quantitative data about the humorous elements present within three significant, real-world software projects: faker, which helps developers introduce humor within their tests; lolcommits, which captures a photograph after each contribution made by a developer; and volkswagen, an exercise in satire, which accidentally led to the invention of an impactful software tool. Second, through a developer survey, we receive unique insights from 125 developers, who share their real-life experiences with humor in software. Our analysis of the three case studies highlights the prevalence of humor in software, and unveils the worldwide community of developers who are enthusiastic about both software and humor. 
We also learn about the caveats of humor in software through the valuable insights shared by our survey respondents. We report clear evidence that, when practiced responsibly, humor increases developer engagement and supports them in addressing hard engineering and cognitive tasks. The most actionable highlight of our work is that software tests and documentation are the best locations in code to practice humor. }, year = {2024} }