BibTeX Export of project::performance::scalasca

@inproceedings{thaerigen_ea:2023:ProTools,
  author    = {Th{\"{a}}rigen, Isabel and Hermanns, Marc-Andr{\'{e}} and Geimer, Markus},
  title     = {An Event Model for Trace-Based Performance Analysis of {MPI} Partitioned Point-to-Point Communication},
  booktitle = {Proc. of the Workshop on Programming and Performance Visualization Tools (ProTools), held in conjunction with the Supercomputing Conference (SC23), Denver, CO, USA},
  pages     = {1357--1367},
  month     = nov,
  year      = {2023},
  publisher = {ACM},
  address   = {New York, NY, USA},
  isbn      = {979-8-4007-0785-8},
  doi       = {10.1145/3624062.3624205},
  url       = {https://juser.fz-juelich.de/record/1018700/files/2023_Thaerigen_ea-EventModelForPartitionedCommunication.pdf},
}

@inproceedings{Feld_ea:2021:BuildAndTest,
  author    = {Feld, Christian and Geimer, Markus and Hermanns, Marc-Andr{\'{e}} and Saviankou, Pavel and Visser, A. and Mohr, Bernd},
  editor    = {Mix, Hartmut and Niethammer, Christoph and Zhou, Huan and Nagel, Wolfgang E. and Resch, Michael M.},
  title     = {Detecting Disaster Before It Strikes: On the Challenges of Automated Building and Testing in {HPC} Environments},
  booktitle = {Tools for High Performance Computing 2018 / 2019},
  pages     = {3--26},
  year      = {2021},
  publisher = {Springer International Publishing},
  isbn      = {978-3-030-66057-4},
  doi       = {10.1007/978-3-030-66057-4_1},
  url       = {https://juser.fz-juelich.de/record/892906/files/Detecting_disaster_before_it_strikes.pdf},
  abstract  = {Software reliability is one of the cornerstones of any successful user experience. Software needs to build up the users' trust in its fitness for a specific purpose. Software failures undermine this trust and add to user frustration that will ultimately lead to a termination of usage. Even beyond user expectations on the robustness of a software package, today's scientific software is more than a temporary research prototype. It also forms the bedrock for successful scientific research in the future. A well-defined software engineering process that includes automated builds and tests is a key enabler for keeping software reliable in an agile scientific environment and should be of vital interest for any scientific software development team. While automated builds and deployment as well as systematic software testing have become common practice when developing software in industry, it is rarely used for scientific software, including tools. Potential reasons are that (1) in contrast to computer scientists, domain scientists from other fields usually never get exposed to such techniques during their training, (2) building up the necessary infrastructures is often considered overhead that distracts from the real science, (3) interdisciplinary research teams are still rare, and (4) high-performance computing systems and their programming environments are less standardized, such that published recipes can often not be applied without heavy modification. In this work, we will present the various challenges we encountered while setting up an automated building and testing infrastructure for the Score-P, Scalasca, and Cube projects. We will outline our current approaches, alternatives that have been considered, and the remaining open issues that still need to be addressed---to further increase the software quality and thus, ultimately improve user experience.},
}

@inproceedings{wylie:2020:ProTools,
  author    = {Wylie, Brian J. N.},
  title     = {Exascale potholes for {HPC}: Execution performance and variability analysis of the flagship application code {HemeLB}},
  booktitle = {Proc. of 2020 IEEE/ACM International Workshop on HPC User Support Tools (HUST) and the Workshop on Programming and Performance Visualization Tools (ProTools), held in conjunction with the Supercomputing Conference (SC20)},
  pages     = {59--70},
  month     = nov,
  year      = {2020},
  publisher = {IEEE},
  isbn      = {978-0-7381-1070-7},
  doi       = {10.1109/HUSTProtools51951.2020.00014},
  url       = {https://juser.fz-juelich.de/record/885773/files/ProTools20_A4.pdf},
}

@inproceedings{ritter_ea:2020:ipdps,
  author    = {Ritter, Marcus and Calotoiu, Alexandru and Rinke, Sebastian and Reimann, Thorsten and Hoefler, Torsten and Wolf, Felix},
  title     = {Learning Cost-Effective Sampling Strategies for Empirical Performance Modeling},
  booktitle = {Proc. of the 34th IEEE International Parallel and Distributed Processing Symposium (IPDPS), New Orleans, LA, USA},
  pages     = {884--895},
  month     = may,
  year      = {2020},
  publisher = {IEEE},
  issn      = {1530-2075},
  isbn      = {978-1-7281-6876-0},
  doi       = {10.1109/IPDPS47924.2020.00095},
}

@inproceedings{Feld_ea:2019:IWOMP,
  author    = {Feld, Christian and Convent, Simon and Hermanns, Marc-Andr{\'{e}} and Protze, Joachim and Geimer, Markus and Mohr, Bernd},
  editor    = {Fan, Xing and de Supinski, Bronis R. and Sinnen, Oliver and Giacaman, Nasser},
  title     = {{Score-P} and {OMPT}: Navigating the Perils of Callback-Driven Parallel Runtime Introspection},
  booktitle = {Proc. of the 15th International Workshop on OpenMP (IWOMP 2019, September 11--13, 2019, Auckland, New Zealand)},
  series    = {Lecture Notes in Computer Science},
  volume    = {11718},
  pages     = {21--35},
  year      = {2019},
  publisher = {Springer},
  address   = {Cham},
  location  = {Auckland, New Zealand},
  doi       = {10.1007/978-3-030-28596-8_2},
}

@article{Hermanns_ea:2019:MPITEvents,
  author   = {Hermanns, Marc-Andr{\'{e}} and Hjelm, Nathan T. and Knobloch, Michael and Mohror, Kathryn and Schulz, Martin},
  title    = {The {MPI\_T} events interface: An early evaluation and overview of the interface},
  journal  = {Parallel Computing},
  volume   = {85},
  pages    = {119--130},
  year     = {2019},
  issn     = {0167-8191},
  doi      = {10.1016/j.parco.2018.12.006},
  url      = {http://www.sciencedirect.com/science/article/pii/S0167819118303314},
  keywords = {Callback functions, MPI, Performance measurement, Runtime introspection, Tool interfaces},
  abstract = {Understanding the behavior of parallel applications that use the Message Passing Interface (MPI) is critical for optimizing communication performance. Performance tools for MPI currently rely on the PMPI Profiling Interface or the MPI Tool Information Interface, MPI_T, for portably collecting information for performance measurement and analysis. While tools using these interfaces have proven to be extremely valuable for performance tuning, these interfaces only provide synchronous information, i.e., when an MPI or an MPI_T function is called. There is currently no option for collecting information about asynchronous events from within the MPI library. In this work we propose a callback-driven interface for event notification from MPI implementations. Our approach is integrated in the existing MPI_T interface and provides a portable API for tools to discover and register for events of interest. We implement our MPI_T Events interface in Open MPI and demonstrate its functionality and usability with a small logging tool (MEL) as well as an early integration into the comprehensive measurement infrastructure Score-P.},
}

@inproceedings{lehr_ea:SC:2019,
  author    = {Lehr, Jan-Patrick and Calotoiu, Alexandru and Bischof, Christian and Wolf, Felix},
  title     = {Automatic Instrumentation Refinement for Empirical Performance Modeling},
  booktitle = {Proc. of the Workshop on Programming and Performance Visualization Tools (ProTools), held in conjunction with the Supercomputing Conference (SC19), Denver, CO, USA},
  pages     = {40--47},
  month     = nov,
  year      = {2019},
  isbn      = {978-1-7281-6026-9},
  doi       = {10.1109/ProTools49597.2019.00011},
}

@inproceedings{calotoiu_ea:SC:2019,
  author    = {Calotoiu, Alexandru and H{\"{o}}hl, Thomas and Mantel, Heiko and Nguyen, Toni and Wolf, Felix},
  title     = {Designing Efficient Parallel Software via Compositional Performance Modeling},
  booktitle = {Proc. of the Workshop on Programming and Performance Visualization Tools (ProTools), held in conjunction with the Supercomputing Conference (SC19), Denver, CO, USA},
  pages     = {17--24},
  month     = nov,
  year      = {2019},
  isbn      = {978-1-7281-6026-9},
  doi       = {10.1109/ProTools49597.2019.00008},
}

@inproceedings{Schluetter_ea:2019:SCIPHI,
  author    = {Schl{\"{u}}tter, Marc and Feld, Christian and Saviankou, Pavel and Knobloch, Michael and Hermanns, Marc-Andr{\'{e}} and Mohr, Bernd},
  title     = {{SCIPHI} {Score-P} and {Cube} Extensions for {Intel Phi}},
  booktitle = {Tools for High Performance Computing 2017},
  pages     = {85--104},
  month     = sep,
  year      = {2019},
  publisher = {Springer International Publishing},
  address   = {Cham},
  isbn      = {978-3-030-11987-4},
  doi       = {10.1007/978-3-030-11987-4_6},
  abstract  = {The Knights Landing processors offers unique features with regards to memory hierarchy and vectorization capabilities.
To improve tool support within these two areas, we present  extensions to the Score-P measurement infrastructure and the Cube report explorer.
With the Knights Landing edition, Intel introduced a new memory architecture, utilizing two  types of memory, MCDRAM and DDR4 SDRAM.
To assist the user in the decision where to place data structures, we introduce a MCDRAM candidate metric to the Cube report explorer.
In addition we track all MCDRAM allocations through the hbwmalloc interface, providing memory metrics like leaked memory or the high-water mark on a per-region basis, as already known for the ubiquitous malloc/free.
A Score-P metric plugin that records memory statistics via numastat on a per process level enables a timeline analysis using the Vampir toolset.
To get the best performance out of , the large vector processing units need to be utilized effectively.
The ratio between computation and data access and the vector processing unit (VPU) intensity are introduced as metrics to identify vectorization candidates on a per-region basis.
The Portable Hardware Locality  (hwloc) Broquedis et al. (hwloc: a generic framework for managing hardware affinities in hpc applications, 2010 [2])  library allows us to visualize the distribution of the  KNL-specific performance metrics within the Cube report explorer, taking the hardware topology consisting of processor tiles and cores into account.},
}

@article{shudler_ea:tpds:2019,
  author  = {Shudler, Sergei and Berens, Yannick and Calotoiu, Alexandru and Hoefler, Torsten and Strube, Alexandre and Wolf, Felix},
  title   = {Engineering Algorithms for Scalability through Continuous Validation of Performance Expectations},
  journal = {IEEE Transactions on Parallel and Distributed Systems},
  volume  = {30},
  number  = {8},
  pages   = {1768--1785},
  month   = aug,
  year    = {2019},
  issn    = {1045-9219},
  doi     = {10.1109/TPDS.2019.2896993},
}

@article{shah_ea:scfi:2019,
  author  = {Shah, Aamer and Kuo, Chihsong and Nomura, Akihiro and Matsuoka, Satoshi and Wolf, Felix},
  title   = {How File-access Patterns Influence the Degree of {I/O} Interference between Cluster Applications},
  journal = {Supercomputing Frontiers and Innovations},
  volume  = {6},
  number  = {2},
  pages   = {29--55},
  month   = jul,
  year    = {2019},
  doi     = {10.14529/jsfi190203},
}

@phdthesis{Hermanns:2018:Dissertation,
  author     = {Hermanns, Marc-Andr{\'{e}}},
  title      = {Understanding the formation of wait states in one-sided communication},
  type       = {Dissertation},
  school     = {RWTH Aachen University},
  series     = {Schriften des Forschungszentrums J{\"{u}}lich Reihe IAS},
  volume     = {35},
  pages      = {xiv, 144 pp},
  year       = {2018},
  publisher  = {Verlag Forschungszentrum J{\"{u}}lich GmbH Zentralbibliothek},
  address    = {J{\"{u}}lich},
  isbn       = {978-3-95806-297-9},
  eprint     = {2128/17545},
  eprinttype = {hdl},
  url        = {http://juser.fz-juelich.de/record/844062},
  abstract   = {Due to the available concurrency in modern-day supercomputers, the complexity of developing efficient parallel applications for these platforms has grown rapidly in the last years. Many applications use message passing for parallelization, offering three main communication paradigms: point-to-point, collective and one-sided communication. Each paradigm fits certain domains of algorithms and communication patterns best. The one-sided paradigm decouples communication and synchronization and allows a single process to define a complete communication. These are important features for runtime systems of new programming paradigms and state-of-the-art dynamic load-balancing strategies. In any process interaction, wait states can occur, where a process is waiting for another - idling - before it proceeds with its local computation. To eliminate such wait states, runtime and application developers alike need support in detecting and quantifying them and their root causes. However, tool support for identifying complex wait states in one-sided communication is scarce. This thesis contributes novel methods for the scalable detection and quantification of wait states in one-sided communication, the automatic identification of their root causes, and the assessment of optimization potential. The methods for wait-state detection and quantification, as introduced by B{\"{o}}hme et al. and extended by this thesis, build upon a parallel post-mortem traversal of process-local event traces, modeling an application's runtime behavior. Performance-relevant data is exchanged just in time on the recorded communication paths. Through the nature of one-sided communication, information on such communication paths is not available on all processes involved, impeding the use of this original approach for one-sided communication.
The use of a novel high-level messaging framework enables the exchange of messages on the implicit communication paths of one-sided communication, while retaining the scalability of the original approach. This enables the identification of previously unstudied types of wait states unique to one-sided communication: lack of remote progress and resource contention. Beyond simple accounting of waiting time, other contributed methods allow pinpointing root causes of such wait states and identifying optimization potential in one-sided applications. Furthermore, they distinguish two fundamentally different classes of wait-state root causes: delays for direct process synchronization (similar to point-to-point and collective communication) and contention in case of lock-based process synchronization, whose resolution strategies are diametrically opposed to each other. Finally, the contributed methods enable the identification of the longest wait-state-free execution path (i.e., critical path) in parallel applications using one-sided communication. As only optimization of functions on the critical path will yield performance improvements, its identification is key to choosing promising optimization targets. All of these methods are integrated into the Scalasca performance toolset. Their scalability and effectiveness are demonstrated by evaluating a variety of applications using one-sided communication interfaces running in configurations with up to 65,536 processes.},
  reportid   = {FZJ-2018-01571},
  cin        = {JSC},
  cid        = {I:(DE-Juel1)JSC-20090406},
  pnm        = {511 - Computational Science and Mathematical Methods (POF3-511)},
  pid        = {G:(DE-HGF)POF3-511},
  typ        = {PUB:(DE-HGF)3 / PUB:(DE-HGF)11},
  urn        = {urn:nbn:de:0001-2018012504},
}

@inproceedings{roth_ea:vpa:2018,
  author    = {Roth, Philip C. and Huck, Kevin and Gopalakrishnan, Ganesh and Wolf, Felix},
  title     = {Using Deep Learning for Automated Communication Pattern Characterization: Little Steps and Big Challenges},
  booktitle = {Proc. of the 5th Workshop on Visual Performance Analysis (VPA), held in conjunction with the Supercomputing Conference (SC18), Dallas, TX, USA},
  series    = {Lecture Notes in Computer Science},
  volume    = {11027},
  pages     = {265--272},
  month     = nov,
  year      = {2018},
  publisher = {Springer},
  isbn      = {978-3-030-17871-0},
  doi       = {10.1007/978-3-030-17872-7_16},
}

@inproceedings{shudler_ea:espt18,
  author    = {Shudler, Sergei and Vrabec, Jadran and Wolf, Felix},
  title     = {Understanding the Scalability of Molecular Simulation using Empirical Performance Modeling},
  booktitle = {Proc. of the 7th Workshop on Extreme Scale Programming Tools (ESPT), held in conjunction with the Supercomputing Conference (SC18), Dallas, TX, USA},
  series    = {Lecture Notes in Computer Science},
  volume    = {11027},
  pages     = {125--143},
  month     = nov,
  year      = {2018},
  publisher = {Springer},
  isbn      = {978-3-030-17871-0},
  doi       = {10.1007/978-3-030-17872-7_8},
}

@inproceedings{burger_lll:2018,
  author    = {Burger, Michael and Bischof, Christian and Calotoiu, Alexandru and Wolf, Felix and Wunderer, Thomas and Buchmann, Johannes},
  title     = {Exploring the Performance Envelope of the {LLL} Algorithm},
  booktitle = {CSE 2018 - 21st IEEE International Conference of Computational Science and Engineering, Faculty of Automatic Control and Computers, University Politehnica of Bucharest, Romania},
  pages     = {36--43},
  month     = oct,
  year      = {2018},
  publisher = {IEEE},
  isbn      = {978-1-5386-7649-3},
  doi       = {10.1109/CSE.2018.00012},
}

@inproceedings{calotoiu_codesign:2018,
  author    = {Calotoiu, Alexandru and Graf, Alexander and Hoefler, Torsten and Lorenz, Daniel and Rinke, Sebastian and Wolf, Felix},
  title     = {Lightweight Requirements Engineering for Exascale Co-design},
  booktitle = {Proc. of the 2018 IEEE International Conference on Cluster Computing (CLUSTER), Belfast, UK},
  pages     = {201--211},
  month     = sep,
  year      = {2018},
  publisher = {IEEE},
  issn      = {2168-9253},
  isbn      = {978-1-5386-8319-4},
  doi       = {10.1109/CLUSTER.2018.00038},
}

@inproceedings{Hermanns_ea:2018:MPITevents,
  author    = {Hermanns, Marc-Andr{\'{e}} and Hjelm, Nathan T. and Knobloch, Michael and Mohror, Kathryn and Schulz, Martin},
  title     = {Enabling callback-driven runtime introspection via {MPI\_T}},
  booktitle = {25th European MPI Users' Group Meeting (EuroMPI'18), September 23--26, 2018, Barcelona, Spain},
  month     = sep,
  year      = {2018},
  publisher = {ACM},
  address   = {New York, NY, USA},
  location  = {Barcelona, Spain},
  doi       = {10.1145/3236367.3236370},
}

@inproceedings{shah_ea:2018::europar,
  author    = {Shah, Aamer and M{\"{u}}ller, Matthias S. and Wolf, Felix},
  title     = {Estimating the Impact of External Interference on Application Performance},
  booktitle = {Proc. of the 24th Euro-Par Conference, Turin, Italy},
  series    = {Lecture Notes in Computer Science},
  volume    = {11014},
  pages     = {46--58},
  month     = aug,
  year      = {2018},
  publisher = {Springer},
  isbn      = {978-3-319-96982-4},
  doi       = {10.1007/978-3-319-96983-1_4},
}

@article{Sharples_ea:2018:run_control_framework,
  author  = {Sharples, Wendy and Zhukov, Ilya and Geimer, Markus and G{\"{o}}rgen, Klaus and L{\"{u}}hrs, Sebastian and Breuer, Thomas and Naz, Bibi and Kulkarni, Ketan and Brdar, Slavko and Kollet, Stefan},
  title   = {A run control framework to streamline profiling, porting, and tuning simulation runs and provenance tracking of geoscientific applications},
  journal = {Geoscientific Model Development},
  volume  = {11},
  number  = {7},
  pages   = {2875--2895},
  month   = jul,
  year    = {2018},
  doi     = {10.5194/gmd-11-2875-2018},
}

@phdthesis{shudler:diss:2018,
  author  = {Shudler, Sergei},
  title   = {Scalability Engineering for Parallel Programs Using Empirical Performance Models},
  school  = {Technische Universit{\"{a}}t Darmstadt},
  address = {Darmstadt, Germany},
  month   = jun,
  year    = {2018},
  url     = {http://tuprints.ulb.tu-darmstadt.de/7471/},
}

@incollection{Hermanns_ea:2017:RmaLockContention,
  author    = {Hermanns, Marc-Andr{\'{e}} and Geimer, Markus and Mohr, Bernd and Wolf, Felix},
  editor    = {Niethammer, Christoph and Gracia, Jos{\'{e}} and Hilbrich, Tobias and Kn{\"{u}}pfer, Andreas and Resch, Michael M. and Nagel, Wolfgang E.},
  title     = {Trace-based Detection of Lock Contention in {MPI} One-Sided Communication},
  booktitle = {Tools for High Performance Computing 2016, Proc. of the 10th Parallel Tools Workshop, Stuttgart, Germany, October 2016},
  pages     = {97--114},
  year      = {2017},
  publisher = {Springer},
  isbn      = {978-3-319-56701-3},
  doi       = {10.1007/978-3-319-56702-0_6},
  url       = {http://juser.fz-juelich.de/record/830159},
}

@article{tafani_ea:2017:montblanc2,
  author  = {Tafani, Daniele and Schl{\"{u}}tter, Marc and Geimer, Markus and Mohr, Bernd and Nachtmann, Mathias and Gracia, Jos{\'{e}}},
  title   = {The {Mont-Blanc} Project: Second Phase successfully finished},
  journal = {Innovatives Supercomputing in Deutschland (inSiDE)},
  volume  = {15},
  number  = {1},
  pages   = {134--141},
  year    = {2017},
  url     = {http://inside.hlrs.de/assets/pdfs/inside_spring17.pdf},
}

@phdthesis{calotoiu:diss:2017,
  author  = {Calotoiu, Alexandru},
  title   = {Automatic Empirical Performance Modeling of Parallel Programs},
  type    = {Dissertation},
  school  = {Technische Universit{\"{a}}t Darmstadt},
  address = {Darmstadt, Germany},
  month   = oct,
  year    = {2017},
  url     = {https://tuprints.ulb.tu-darmstadt.de/7234/},
}

@inproceedings{reisert_ea:europar:2017,
  author    = {Reisert, Patrick and Calotoiu, Alexandru and Shudler, Sergei and Wolf, Felix},
  title     = {Following the Blind Seer -- Creating Better Performance Models Using Less Information},
  booktitle = {Proc. of the 23rd Euro-Par Conference, Santiago de Compostela, Spain},
  series    = {Lecture Notes in Computer Science},
  volume    = {10417},
  pages     = {106--118},
  month     = aug,
  year      = {2017},
  publisher = {Springer},
  isbn      = {978-3-319-64202-4},
  doi       = {10.1007/978-3-319-64203-1_8},
}

@inproceedings{ilyas_ea:europar:2017,
  author    = {Ilyas, Kashif and Calotoiu, Alexandru and Wolf, Felix},
  title     = {Off-Road Performance Modeling -- How to Deal with Segmented Data},
  booktitle = {Proc. of the 23rd Euro-Par Conference, Santiago de Compostela, Spain},
  series    = {Lecture Notes in Computer Science},
  volume    = {10417},
  pages     = {36--48},
  month     = aug,
  year      = {2017},
  publisher = {Springer},
  isbn      = {978-3-319-64202-4},
  doi       = {10.1007/978-3-319-64203-1_3},
}

@inproceedings{lorenz_ea:iccs:2017,
  author    = {Lorenz, Daniel and Feld, Christian},
  title     = {Scaling {Score-P} to the next level},
  booktitle = {Proc. of the International Conference of Computational Science Workshops},
  pages     = {2180--2189},
  month     = jun,
  year      = {2017},
  publisher = {Elsevier},
  location  = {Z{\"{u}}rich, Switzerland},
  doi       = {10.1016/j.procs.2017.05.107},
}

@inproceedings{Iliev_ea:2017:OnDemandTeams,
  author    = {Iliev, Hristo and Hermanns, Marc-Andr{\'{e}} and G{\"{o}}bbert, Jens Henrik and Halver, Ren{\'{e}} and Terboven, Christian and Mohr, Bernd and M{\"{u}}ller, Matthias S.},
  title     = {Performance Optimization of Parallel Applications in Diverse On-Demand Development Teams},
  booktitle = {High-Performance Scientific Computing -- First JARA-HPC Symposium 2016, October 4--5, 2016, Aachen, Germany},
  series    = {Lecture Notes in Computer Science},
  volume    = {10164},
  pages     = {187--199},
  month     = mar,
  year      = {2017},
  publisher = {Springer International Publishing},
  address   = {Cham},
  isbn      = {978-3-319-53861-7},
  doi       = {10.1007/978-3-319-53862-4_16},
  url       = {https://juser.fz-juelich.de/record/827935},
  abstract  = {Current supercomputing platforms and scientific application codes have grown rapidly in complexity over the past years. Multi-scale, multi-domain simulations on one hand and deep hierarchies in large-scale computing platforms on the other make it exceedingly harder to map the former onto the latter and fully exploit the available computational power. The complexity of the software and hardware components involved calls for in-depth expertise that can only be met by diversity in the application development teams. With its model of simulation labs and cross-sectional groups, JARA-HPC enables such diverse teams to form on demand to solve concrete development problems. This work showcases the effectiveness of this model with two application case studies involving the JARA-HPC cross-sectional group ``Parallel Efficiency'' and simulation labs and domain-specific development teams. For one application, we show the results of a completed optimization and the estimated financial impact of the combined efforts. For the other application, we present results from an ongoing engagement, where we show how an on-demand team investigates the behavior of dynamic load balancing schemes for an MD particle simulation, leading to a better overall understanding of the application and revealing targets for further investigation.},
}

@inproceedings{shudler_ea:ppopp:2017,
  author    = {Shudler, Sergei and Calotoiu, Alexandru and Hoefler, Torsten and Wolf, Felix},
  title     = {Isoefficiency in Practice: Configuring and Understanding the Performance of Task-based Applications},
  booktitle = {Proc. of the ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming (PPoPP), Austin, TX, USA},
  pages     = {131--143},
  month     = feb,
  year      = {2017},
  publisher = {ACM},
  isbn      = {978-1-4503-4493-7},
  doi       = {10.1145/3018743.3018770},
}

@inproceedings{Vierjahn_ea:2016:DirectedVariance,
  author    = {Vierjahn, Tom and Hermanns, Marc-Andr{\'{e}} and Mohr, Bernd and M{\"{u}}ller, Matthias S. and Kuhlen, Torsten W. and Hentschel, Bernd},
  title     = {Using Directed Variance to Identify Meaningful Views in Call-path Performance Profiles},
  booktitle = {Proceedings of the 3rd International Workshop on Visual Performance Analysis},
  series    = {VPA '16},
  pages     = {9--16},
  year      = {2016},
  publisher = {IEEE Press},
  address   = {Piscataway, NJ, USA},
  location  = {Salt Lake City, Utah},
  isbn      = {978-1-5090-5226-4},
  doi       = {10.1109/vpa.2016.7},
  url       = {https://doi.org/10.1109/VPA.2016.7},
  numpages  = {8},
  acmid     = {3019117},
}

@inproceedings{calotoiu_ea:2016,
  author    = {Calotoiu, Alexandru and Beckingsale, David and Earl, Christopher W. and Hoefler, Torsten and Karlin, Ian and Schulz, Martin and Wolf, Felix},
  title     = {Fast Multi-Parameter Performance Modeling},
  booktitle = {Proc. of the 2016 IEEE International Conference on Cluster Computing (CLUSTER), Taipei, Taiwan},
  pages     = {172--181},
  month     = sep,
  year      = {2016},
  publisher = {IEEE},
  issn      = {2168-9253},
  isbn      = {978-1-5090-3653-0},
  doi       = {10.1109/CLUSTER.2016.57},
}

@article{Boehme:2016:root_cause_wait_states,
  author    = {B{\"{o}}hme, David and Geimer, Markus and Arnold, Lukas and Voigtl{\"{a}}nder, Felix and Wolf, Felix},
  title     = {Identifying the root causes of wait states in large-scale parallel applications},
  journal   = {ACM Transactions on Parallel Computing},
  volume    = {3},
  number    = {2},
  pages     = {11:1--11:24},
  month     = jul,
  year      = {2016},
  issn      = {2329-4949},
  doi       = {10.1145/2934661},
  articleno = {11},
  numpages  = {24},
}

@inproceedings{harlacher_ea:nic:2016,
  author       = {Harlacher, Monika and Calotoiu, Alexandru and Dennis, John and Wolf, Felix},
  editor       = {Binder, Kurt and M{\"{u}}ller, Marcus and Kremer, Manfred and Schnurpfeil, Alexander},
  title        = {Analysing the Scalability of Climate Codes Using New Features of {Scalasca}},
  booktitle    = {Proc. of the John von Neumann Institute for Computing (NIC) Symposium 2016, J{\"{u}}lich, Germany},
  series       = {NIC Series},
  volume       = {48},
  pages        = {343--352},
  month        = feb,
  year         = {2016},
  publisher    = {John von Neumann Institute for Computing},
  organization = {Forschungszentrum J{\"{u}}lich},
  isbn         = {978-3-95806-109-5},
}

@inproceedings{zhukov_ea:2014:ScalascaV2,
  author    = {Zhukov, Ilya and Feld, Christian and Geimer, Markus and Knobloch, Michael and Mohr, Bernd and Saviankou, Pavel},
  title     = {{Scalasca} v2: Back to the Future},
  booktitle = {Proc. of Tools for High Performance Computing 2014},
  pages     = {1--24},
  year      = {2015},
  publisher = {Springer},
  isbn      = {978-3-319-16011-5},
  doi       = {10.1007/978-3-319-16012-2_1},
  abstract  = {Scalasca is a well-established open-source toolset that supports the performance optimization of parallel programs by measuring and analyzing their runtime behavior. The analysis identifies potential performance bottlenecks -- in particular those concerning communication and synchronization -- and offers guidance in exploring their causes. The latest Scalasca v2 release series is based on the community instrumentation and measurement infrastructure Score-P, which is jointly developed by a consortium of partners from Germany and the US. This significantly improves interoperability with other performance analysis tool suites such as Vampir and TAU due to the usage of the two common data formats CUBE4 for profiles and the Open Trace Format 2 (OTF2) for event trace data. This paper will showcase recent as well as ongoing enhancements, such as support for additional platforms (K computer, Intel Xeon Phi) and programming models (POSIX threads, MPI-3, OpenMP4), and features like the critical-path analysis. It also summarizes the steps necessary for users to migrate from Scalasca v1 to Scalasca v2.},
}

@inproceedings{vonRueden:2015:IdentifyingRelevantAndSimilarPerfData,
  author    = {von R{\"{u}}den, Laura and Hermanns, Marc-Andr{\'{e}} and Behrisch, Michael and Keim, Daniel and Mohr, Bernd and Wolf, Felix},
  title     = {Separating the Wheat from the Chaff: Identifying Relevant and Similar Performance Data with Visual Analytics},
  booktitle = {Proc. of the 2nd Workshop on Visual Performance Analysis (VPA), held in conjunction with the Supercomputing Conference (SC15), Austin, TX, USA},
  pages     = {4:1--4:8},
  year      = {2015},
  publisher = {ACM},
  isbn      = {978-1-4503-4013-7},
  doi       = {10.1145/2835238.2835242},
  acmid     = {2835242},
  articleno = {4},
  numpages  = {8},
}

@inproceedings{Lorenz:2015:aggregation,
  author    = {Lorenz, Daniel and Shudler, Sergei and Wolf, Felix},
  title     = {Preventing the explosion of exascale profile data with smart thread-level aggregation},
  booktitle = {Proc. of the 4th Workshop on Extreme Scale Programming Tools (ESPT), held in conjunction with the Supercomputing Conference (SC15), Austin, TX, USA},
  pages     = {1--10},
  month     = nov,
  year      = {2015},
  publisher = {ACM},
  isbn      = {978-1-4503-3997-1},
  doi       = {10.1145/2832106.2832107},
}

@inproceedings{Vogel_ea:2015:10KPerfModels,
  author    = {Vogel, Andreas and Calotoiu, Alexandru and Strube, Alexandre and Reiter, Sebastian and N{\"{a}}gel, Arne and Wolf, Felix and Wittum, Gabriel},
  title     = {10,000 Performance Models per Minute -- Scalability of the {UG4} Simulation Framework},
  booktitle = {Proc. of the 21st Euro-Par Conference, Vienna, Austria},
  series    = {Lecture Notes in Computer Science},
  volume    = {9233},
  pages     = {519--531},
  month     = aug,
  year      = {2015},
  publisher = {Springer},
  isbn      = {978-3-662-48095-3},
  doi       = {10.1007/978-3-662-48096-0_40},
  keywords  = {performance modeling, ug4},
}

@INPROCEEDINGS{Iwainsky_ea:2015:HowManyThreads,
     author = {Iwainsky, Christian and Shudler, Sergei and Calotoiu, Alexandru and Strube, Alexandre and Knobloch, Michael and Bischof, Christian and Wolf, Felix},
      month = aug,
      title = {How Many Threads will be too Many? On the Scalability of {OpenMP} Implementations},
  booktitle = {Proc. of the 21st Euro-Par Conference, Vienna, Austria},
     series = {Lecture Notes in Computer Science},
     volume = {9233},
       year = {2015},
      pages = {451--463},
  publisher = {Springer},
       isbn = {978-3-662-48095-3},
        doi = {10.1007/978-3-662-48096-0_35}
}

@INPROCEEDINGS{shudler_ea:2015,
     author = {Shudler, Sergei and Calotoiu, Alexandru and Hoefler, Torsten and Strube, Alexandre and Wolf, Felix},
      title = {Exascaling Your Library: Will Your Implementation Meet Your Expectations?},
  booktitle = {Proc. of the International Conference on Supercomputing (ICS), Newport Beach, CA, USA},
      pages = {165--175},
  publisher = {ACM},
       year = {2015},
      month = jun,
        doi = {10.1145/2751205.2751216},
       isbn = {978-1-4503-3559-1}
}

@ARTICLE{Saviankou_ea:2015:CubeV4,
    author = {Saviankou, Pavel and Knobloch, Michael and Visser, A. and Mohr, Bernd},
  keywords = {Call-tree profile, Derived metrics, DSL, GUI plugins, Performance Analysis},
     month = jun,
     title = {{Cube} v4: From Performance Report Explorer to Performance Analysis Tool},
   journal = {Procedia Computer Science},
    volume = {51},
      year = {2015},
     pages = {1343--1352},
      issn = {1877-0509},
       doi = {10.1016/j.procs.2015.05.320},
  abstract = {Cube v3 has been a powerful tool to examine reports of the parallel performance tool Scalasca, but was basically unable to perform analyses on its own. With Cube v4, we addressed several shortcomings of Cube v3. We generalized the Cube data model, extended the list of supported data types, and allow operations with nontrivial algebras, e.g. for performance models or statistical data. Additionally, we introduced two major new features that greatly enhance the performance analysis features of Cube: Derived metrics and GUI plugins. Derived metrics can be used to create and manipulate metrics directly within the GUI, using a powerful domain-specific language called CubePL. Cube GUI plugins allow the development of novel performance analysis techniques and visualizations based on Cube data without changing the source code of the Cube GUI.}
}

@INPROCEEDINGS{jiang14,
     author = {Jiang, Jie and Philippen, Peter and Knobloch, Michael and Mohr, Bernd},
   keywords = {Blue Gene/Q, parallel programming, Performance Analysis, Speculative Execution, Transactional Memory},
      title = {Performance Measurement and Analysis of Transactional Memory and Speculative Execution on {IBM Blue Gene/Q}},
  booktitle = {Proceedings of Euro-Par 2014 Parallel Processing},
     series = {Lecture Notes in Computer Science},
     volume = {8632},
       year = {2014},
      pages = {26--37},
  publisher = {Springer International Publishing},
   location = {Switzerland},
       issn = {0302-9743},
       isbn = {978-3-319-09873-9},
        url = {http://link.springer.com/chapter/10.1007/978-3-319-09873-9_3},
        doi = {10.1007/978-3-319-09873-9_3},
   abstract = {The core count of modern processors is steadily increasing, forcing programmers to use more concurrent threads or tasks to effectively use the available hardware. This in turn makes it increasingly challenging to achieve correct and efficient thread synchronization. To support the programmer in this task, IBM introduced hardware transactional memory (TM) and speculative execution (SE) in their Blue Gene/Q system with its 16-core processor, which permits to run 64 simultaneous hardware threads in SMT mode. TM and SE allow for parallelization when race conditions may happen, however upon their detection the respective parts of the execution are rolled back and re-executed serially. This incurs some overhead and therefore usage must be well justified. In this paper, we describe extensions to the community instrumentation and measurement infrastructure Score-P, allowing developers to instrument, measure, and analyze applications. To our knowledge, this is the first integrated performance tool framework allowing to analyze TM/SE programs. We demonstrate its usefulness and effectiveness by describing experiments with benchmarks and a real-world application.}
}

@ARTICLE{Feld_ea:2014:RAPID,
    author = {R{\"{o}}ssel, Christian and Mohr, Bernd and Geimer, Markus and Becker, Daniel},
     title = {Successful Technology Transfer with {Siemens} -- The {RAPID} Project},
   journal = {Innovatives Supercomputing in Deutschland (inSiDE)},
     pages = {72--75},
      year = {2014},
    volume = {12},
    number = {3},
       url = {http://inside.hlrs.de/assets/pdfs/inside_autumn14.pdf}
}

@ARTICLE{Gasper_ea:2014_TerrSysMP,
    author = {Gasper, Fabian and G{\"{o}}rgen, Klaus and Shrestha, Prabhakar and Sulis, Mauro and Rihani, Jehan and Geimer, Markus and Kollet, Stefan},
     title = {Implementation and scaling of the fully coupled Terrestrial Systems Modeling Platform ({TerrSysMP} v1.0) in a massively parallel supercomputing environment -- a case study on {JUQUEEN} ({IBM Blue Gene/Q})},
   journal = {Geoscientific Model Development},
     pages = {2531--2543},
      year = {2014},
     month = oct,
    volume = {7},
    number = {5},
       doi = {10.5194/gmd-7-2531-2014},
       url = {http://www.geosci-model-dev.net/7/2531/2014/}
}

@INPROCEEDINGS{Lorenz_ea:2014:comparison_OPARI2_OpenMP,
     author = {Lorenz, Daniel and Dietrich, Robert and Tsch{\"{u}}ter, Ronny and Wolf, Felix},
      month = sep,
      title = {A comparison between {OPARI2} and the {OpenMP} tools interface in the context of {Score-P}},
  booktitle = {Proc. of the 10th International Workshop on OpenMP (IWOMP), Salvador, Brazil, September 2014},
     series = {Lecture Notes in Computer Science},
     volume = {8766},
       year = {2014},
      pages = {161--172},
  publisher = {Springer},
   location = {Salvador, Brazil},
       isbn = {978-3-319-11453-8},
        doi = {10.1007/978-3-319-11454-5_12}
}

@INPROCEEDINGS{Mao_ea:2014:CatchingIdlersWithEase,
     author = {Mao, Gouyong and B{\"{o}}hme, David and Hermanns, Marc-Andr{\'{e}} and Geimer, Markus and Lorenz, Daniel and Wolf, Felix},
   keywords = {MPI, Performance Analysis, profiling, Score-P, wait states},
      month = sep,
      title = {Catching Idlers with Ease: {A} Lightweight Wait-State Profiler for {MPI} Programs},
  booktitle = {EuroMPI '14: Proc. of the 21st European MPI Users' Group Meeting, Kyoto, Japan},
       year = {2014},
      pages = {103--108},
  publisher = {ACM},
   location = {New York, NY, USA},
       isbn = {978-1-4503-2875-3},
        doi = {10.1145/2642769.2642783}
}

@INPROCEEDINGS{kuo_etal:2014:FileAccessPatternsInterference,
     author = {Kuo, Chihsong and Shah, Aamer and Nomura, Akihiro and Matsuoka, Satoshi and Wolf, Felix},
      title = {How File Access Patterns Influence Interference Among Cluster Applications},
  booktitle = {Proc. of the IEEE International Conference on Cluster Computing (CLUSTER), Madrid, Spain},
      pages = {1--8},
  publisher = {IEEE},
       year = {2014},
      month = sep,
        doi = {10.1109/CLUSTER.2014.6968743},
       isbn = {978-1-4799-5548-0},
       issn = {1552-5244}
}

@INPROCEEDINGS{Wolf_ea:2014:Catwalk,
     author = {Wolf, Felix and Bischof, Christian and Hoefler, Torsten and Mohr, Bernd and Wittum, Gabriel and Calotoiu, Alexandru and Iwainsky, Christian and Strube, Alexandre and Vogel, Andreas},
      title = {Catwalk: A Quick Development Path for Performance Models},
     editor = {Lopez, Luis},
  booktitle = {Euro-Par 2014: Parallel Processing Workshops},
     series = {Lecture Notes in Computer Science},
     volume = {8805, 8806},
  publisher = {Springer},
       year = {2014},
      month = sep,
        doi = {10.1007/978-3-319-14313-2_50},
       isbn = {978-3-319-14312-5}
}

@INPROCEEDINGS{Calotoiu_ea:2014:performance_models,
     author = {Calotoiu, Alexandru and Hoefler, Torsten and Wolf, Felix},
      month = aug,
      title = {Mass-producing Insightful Performance Models},
  booktitle = {Workshop on Modeling \& Simulation of Systems and Applications, University of Washington},
       year = {2014},
    address = {Seattle, Washington},
        url = {http://hpc.pnl.gov/modsim/2014/index.shtml},
   abstract = {Many parallel applications suffer from latent performance
  limitations that may prevent them from scaling to larger machine
  sizes. Often, such scalability bugs manifest themselves only when an
  attempt to scale the code is actually being made -- a point where
  remediation can be difficult. However, creating performance models
  that would allow such issues to be pinpointed earlier is so
  laborious that application developers attempt it at most for a few
  selected kernels, running the risk of missing harmful
  bottlenecks. By automatically generating empirical performance
  models for each function in the program, we make this powerful
  methodology easier to use and expand its coverage. This article
  gives an overview of the method and assesses its potential.}
}

@INPROCEEDINGS{Schluetter_ea:2014:ProfilingHMPP,
     author = {Schl{\"{u}}tter, Marc and Philippen, Peter and Morin, Laurent and Geimer, Markus and Mohr, Bernd},
     editor = {Bader, Michael and Bode, Arndt and Bungartz, Hans-Joachim and Gerndt, Michael and Joubert, Gerhard R. and Peters, Frans J.},
   keywords = {accelerator, CUBE, GPGPU, GPU, OpenACC, optimisation, performance, PHMPP, profiling, Score-P, tools, tracing},
      month = mar,
      title = {Profiling Hybrid HMPP Applications with Score-P on Heterogeneous Hardware},
  booktitle = {Parallel Computing: Accelerating Computational Science and Engineering (CSE)},
     series = {Advances in Parallel Computing},
     volume = {25},
       year = {2014},
      pages = {773--782},
  publisher = {IOS Press},
       isbn = {978-1-61499-381-0},
        url = {http://www.ebooks.iospress.nl/volumearticle/35952},
        doi = {10.3233/978-1-61499-381-0-773},
   abstract = {In heterogeneous environments with multi-core systems and accelerators, programming and optimizing large parallel applications turns into a time-intensive and hardware-dependent challenge. To assist application developers in this process, a number of tools and high-level compilers have been developed. Directive-based programming models such as HMPP and OpenACC provide abstractions over low-level GPU programming models, such as CUDA or OpenCL. The compilers developed by CAPS automatically transform the pragma-annotated application code into low-level code, thereby allowing the parallelization and optimization for a given accelerator hardware. To analyze the performance of parallel applications, multiple partners in Germany and the US jointly develop the community measurement infrastructure Score-P. Score-P gathers performance execution profiles, which can be presented and analyzed within the CUBE result browser, and collects detailed event traces to be processed by post-mortem analysis tools such as Scalasca and Vampir.

In this paper we present the integration and combined use of Score-P and the CAPS compilers as one approach to efficiently parallelize and optimize codes. Specifically, we describe the PHMPP profiling interface, its implementation in Score-P, and the presentation of preliminary results in CUBE.}
}

@INPROCEEDINGS{Jaeger_ea:2014:BinaryInstrumentation,
     author = {Jaeger, Julien and Philippen, Peter and Petit, Eric and Charif Rubial, Andres and R{\"{o}}ssel, Christian and Jalby, William and Mohr, Bernd},
     editor = {Bader, Michael and Bode, Arndt and Bungartz, Hans-Joachim and Gerndt, Michael and Joubert, Gerhard R. and Peters, Frans J.},
   keywords = {analysis, Binary instrumentation, MAQAO, measurement, OpenMP, performance, profile, Score-P, trace},
      month = mar,
      title = {Binary Instrumentation for Scalable Performance Measurement of OpenMP Applications},
  booktitle = {Parallel Computing: Accelerating Computational Science and Engineering (CSE)},
     series = {Advances in Parallel Computing},
     volume = {25},
       year = {2014},
      pages = {783--792},
  publisher = {IOS Press},
   location = {Amsterdam, Berlin, Tokyo, Washington D.C.},
       isbn = {978-1-61499-380-3},
        url = {http://ebooks.iospress.nl/publication/35953},
        doi = {10.3233/978-1-61499-381-0-783},
   abstract = {In this paper we present a binary instrumentation methodology to monitor runtime events. We demonstrate our approach on OpenMP constructs for the Intel and GNU compilers. A binary-level static analysis detects the compiler patterns and the runtime function calls corresponding to OpenMP regions. To this effect we integrate the software tool MAQAO with the scalable measurement infrastructure Score-P. We design a new interface and modify both tools to support the new events. The main advantages of using binary instrumentation are the possibility to retrieve implicit runtime events, to instrument without recompilation, to be independent from the language, and not to interact with compiler optimization. Our validation experiments and first results shows that binary instrumentation has not introduced any additional overhead.}
}

@PHDTHESIS{boehme:2014:dissertation,
    author = {B{\"{o}}hme, David},
     month = feb,
     title = {Characterizing Load and Communication Imbalance in Parallel Applications},
      type = {Dissertation},
      year = {2014},
    school = {RWTH Aachen University},
   address = {volume 23 of IAS Series, Forschungszentrum J{\"u}lich},
      note = {{ISBN} 978-3-89336-940-9},
       url = {http://darwin.bth.rwth-aachen.de/opus3/volltexte/2014/4986/},
       doi = {2128/5909},
  abstract = {The amount of parallelism in modern supercomputers currently grows from generation to generation, and is expected to reach orders of millions of processor cores in a single system in the near future. Further application performance improvements therefore depend to a large extend on software-managed parallelism: in particular, the software must organize data exchange between processing elements efficiently and optimally distribute the workload between them. Performance analysis tools help developers of parallel applications to evaluate and optimize the parallel efficiency of their programs by pinpointing specific performance bottlenecks. However, existing tools are often incapable of identifying complex imbalance patterns and determining their performance impact reliably. This dissertation presents two novel methods to automatically extract imbalance-related performance problems from event traces generated by MPI programs and intuitively guide the performance analyst to inefficiencies whose optimization promise the highest benefit.
The first method, the delay analysis, identifies the root causes of wait states. A delay occurs when a program activity needs more time on one process than on another, which leads to the formation of wait states at a subsequent synchronization point. Wait states, which are intervals through which a process is idle while waiting for the delayed process, are the primary symptom of load imbalance in parallel programs. While wait states themselves are easy to detect, the potentially large temporal and spatial distance between wait states and the delays causing them complicates the identification of wait-state root causes. The delay analysis closes this gap, accounting for both short-term and long-term effects. To this end, the delay analysis comprises two contributions of this dissertation: (1) a cost model and terminology to describe the severity of a delay in terms of the overall waiting time it causes; and (2) a scalable algorithm to identify the locations of delays and determine their cost.
The second new analysis method is based on the detection of the critical path. In contrast to the delay analysis, which characterizes the formation of wait states, this critical-path analysis determines the effect of imbalance on program runtime. The critical path is the longest execution path in a parallel program without wait states: optimizing an activity on the critical path will reduce the programs run time. Comparing the duration of activities on the critical path with their duration on each process yields a set of novel, compact performance indicators. These indicators allow users to evaluate load balance, identify performance bottlenecks, and determine the performance impact of load imbalance at first glance by providing an intuitive understanding of complex performance phenomena. Unlike existing statistics-based load balance metrics, these indicators are applicable to both SPMD and MPMD-style programs.
Both analysis methods leverage the scalable event-trace analysis technique employed by the Scalasca toolset: by replaying event traces in parallel, the bottleneck search algorithms can harness the distributed memory and computational resources of the target system for the analysis, allowing them to process even large-scale program runs. The scalability and performance insight that the novel analysis approaches provide are demonstrated by evaluating a variety of real-world HPC codes in configurations with up to 262,144 processor cores.}
}

@INPROCEEDINGS{zhukov_wylie:PROPER13,
     author = {Zhukov, Ilya and Wylie, Brian J. N.},
      month = jan,
      title = {Assessing Measurement and Analysis Performance and Scalability of Scalasca 2.0},
  booktitle = {Proc. of the Euro-Par 2013: Parallel Processing Workshops},
     series = {Lecture Notes in Computer Science},
     volume = {8374},
       year = {2014},
      pages = {627--636},
  publisher = {Springer},
        doi = {10.1007/978-3-642-54420-0_61},
   abstract = {The Scalasca toolset was developed to provide highly scalable performance measurement and analysis of scientific applications on current HPC platforms, including leadership systems such as IBM BlueGene/Q and more traditional Linux clusters. Its primary focus is support for C/C++/Fortran applications using MPI and OpenMP, and mixed-mode combinations thereof, offering detailed call-path profiles for each process and thread produced by runtime summarization or augmented with wait-state analysis of event traces. A new generation of Scalasca (2.0) uses the community-developed infrastructure comprising of Score-P and associated components, while continuing to provide the previous functionality. By comparing the new version of Scalasca with its predecessor, using the applications from the NPB3.3-MZ-MPI benchmark suite, we validate core functionality and assess overheads and scalability. Although adequate for general use, various aspects are identified for further improvement, particularly for larger scales.}
}

@INCOLLECTION{Knuepfer_ea:2013:OTF2Rma,
     author = {Kn{\"{u}}pfer, Andreas and Dietrich, Robert and Doleschal, Jens and Geimer, Markus and Hermanns, Marc-Andr{\'{e}} and R{\"{o}}ssel, Christian and Tsch{\"{u}}ter, Ronny and Wesarg, Bert and Wolf, Felix},
      title = {Generic Support for Remote Memory Access Operations in {Score-P} and {OTF2}},
     editor = {Cheptsov, Alexey and Brinkmann, Steffen and Gracia, Jos{\'{e}} and Resch, Michael M. and Nagel, Wolfgang E.},
  booktitle = {Tools for High Performance Computing 2012, Proc. of the 6th Parallel Tools Workshop, Stuttgart, Germany, September 2012},
      pages = {57--74},
  publisher = {Springer},
       year = {2013},
        doi = {10.1007/978-3-642-37349-7_5},
       isbn = {978-3-642-37348-0},
   language = {English}
}

@INCOLLECTION{Lorenz_ea:2013:ExtendingScalasca,
     author = {Lorenz, Daniel and B{\"{o}}hme, David and Mohr, Bernd and Strube, Alexandre and Szebenyi, Zolt{\'{a}}n},
     editor = {Cheptsov, Alexey and Brinkmann, Steffen and Gracia, Jos{\'{e}} and Resch, Michael M. and Nagel, Wolfgang E.},
      title = {Extending {Scalasca's} Analysis Features},
  booktitle = {Tools for High Performance Computing 2012},
       year = {2013},
      pages = {115--126},
  publisher = {Springer Berlin Heidelberg},
       isbn = {978-3-642-37348-0},
        doi = {10.1007/978-3-642-37349-7_8},
   language = {English}
}

@INPROCEEDINGS{Eichenberger_ea:2013:OMPT,
     author = {Eichenberger, Alexandre E. and Mellor-Crummey, John M. and Schulz, Martin and Wong, Michael and Copty, Nawal and DelSignore, John and Dietrich, Robert and Liu, Xu and Loh, Eugene and Lorenz, Daniel},
      title = {{OMPT}: {OpenMP} Tools Application Programming Interfaces for Performance Analysis},
  booktitle = {Proc. of the 9th International Workshop on OpenMP (IWOMP), Canberra, Australia},
     series = {Lecture Notes in Computer Science},
     volume = {8122},
       year = {2013},
      pages = {171--185},
  publisher = {Springer},
    address = {Berlin / Heidelberg},
        doi = {10.1007/978-3-642-40698-0_13}
}

@INCOLLECTION{Mohr_ea:2013:HOPSA_Workflow,
     author = {Mohr, Bernd and Voevodin, Vladimir and Gim{\'{e}}nez, Judit and Hagersten, Erik and Kn{\"{u}}pfer, Andreas and Nikitenko, Dmitry A. and Nilsson, Mats and Servat, Harald and Shah, Aamer and Winkler, Frank and Wolf, Felix and Zhukov, Ilya},
     editor = {Cheptsov, Alexey and Brinkmann, Steffen and Gracia, Jos{\'{e}} and Resch, Michael M. and Nagel, Wolfgang E.},
      title = {The {HOPSA} Workflow and Tools},
  booktitle = {Tools for High Performance Computing 2012, Proc. of the 6th Parallel Tools Workshop, Stuttgart, Germany, September 2012},
       year = {2013},
      pages = {127--146},
  publisher = {Springer},
       isbn = {978-3-642-37348-0},
        doi = {10.1007/978-3-642-37349-7_9},
   language = {English}
}

@INPROCEEDINGS{calotoiu_ea:2013:modeling,
     author = {Calotoiu, Alexandru and Hoefler, Torsten and Poke, Marius and Wolf, Felix},
      month = nov,
      title = {Using Automated Performance Modeling to Find Scalability Bugs in Complex Codes},
  booktitle = {Proc. of the ACM/IEEE Conference on Supercomputing (SC13), Denver, CO, USA},
       year = {2013},
      pages = {1--12},
  publisher = {ACM},
       isbn = {978-1-4503-2378-9},
        doi = {10.1145/2503210.2503277}
}

@INPROCEEDINGS{Hermanns_ea:2013:UnderstandingRmaWaitStates,
     author = {Hermanns, Marc-Andr{\'{e}} and Miklosch, Manfred and B{\"{o}}hme, David and Wolf, Felix},
      title = {Understanding the formation of wait states in applications with one-sided communication},
  booktitle = {EuroMPI '13: Proc. of the 20th European MPI Users' Group Meeting, Madrid, Spain, September 15--18, 2013},
      pages = {73--78},
  publisher = {ACM},
    address = {New York, NY, USA},
   location = {Madrid, Spain},
       year = {2013},
      month = sep,
        doi = {10.1145/2488551.2488569},
       isbn = {978-1-4503-1903-4},
      acmid = {2488569},
   keywords = {critical path, one-sided communication, Performance Analysis, performance optimization, root cause}
}

@INPROCEEDINGS{shah_ea:2013:ApplicationInterference,
     author = {Shah, Aamer and Wolf, Felix and Zhumatiy, Sergey and Voevodin, Vladimir},
      month = sep,
      title = {Capturing inter-application interference on clusters},
  booktitle = {Proc. of the IEEE International Conference on Cluster Computing (CLUSTER), Indianapolis, IN, USA},
       year = {2013},
      pages = {1--5},
  publisher = {IEEE},
       issn = {1552-5244},
       isbn = {978-1-4799-0898-1},
        doi = {10.1109/cluster.2013.6702665}
}

@INPROCEEDINGS{wylie_frings:xsede13:MIC,
     author = {Wylie, Brian J. N. and Frings, Wolfgang},
      month = jul,
      title = {{Scalasca} support for {MPI+OpenMP} parallel applications on large-scale {HPC} systems based on {Intel Xeon Phi}},
  booktitle = {Proc. XSEDE'13 Conference on Extreme Science and Engineering Discovery Environment: Gateway to Discovery (San Diego, CA, USA)},
       year = {2013},
  publisher = {ACM},
       isbn = {978-1-4503-2170-9},
        doi = {10.1145/2484762.2484777},
   abstract = {Intel Xeon Phi coprocessors based on the Many Integrated Core (MIC) architecture are starting to appear in HPC systems, with Stampede being a prominent example available within the XSEDE cyber-infrastructure. Porting MPI and OpenMP applications to such systems is often no more than simple recompilation, however, execution performance needs to be carefully analyzed and tuned to effectively exploit their unique capabilities. For performance measurement and analysis tools, the variety of execution modes need to be supported in a consistent and convenient manner, and especially execution configurations involving large numbers of compute nodes each with several multicore host processors and many-core coprocessors. Early experience using the open-source Scalasca toolset for runtime summarization and automatic trace analysis with the NPB BT-MZ MPI+OpenMP parallel application on Stampede is reported, along with discussion of on-going and future work.}
}

@ARTICLE{becker_ea:2011:scope,
     author = {Becker, Daniel and Geimer, Markus and Rabenseifner, Rolf and Wolf, Felix},
      title = {Extending the scope of the controlled logical clock},
    journal = {Cluster Computing},
      pages = {171--189},
  publisher = {Springer},
       year = {2013},
      month = mar,
     volume = {16},
     number = {1},
        doi = {10.1007/s10586-011-0181-8},
       issn = {1386-7857}
}

@ARTICLE{Hermanns_ea:2012:PassiveTargetInfrastructure,
     author = {Hermanns, Marc-Andr{\'{e}} and Krishnamoorthy, Sriram and Wolf, Felix},
      title = {A scalable infrastructure for the performance analysis of passive target synchronization},
    journal = {Parallel Computing},
      pages = {132--145},
  publisher = {Elsevier},
       year = {2013},
      month = mar,
     volume = {39},
     number = {3},
        doi = {10.1016/j.parco.2012.09.002},
       issn = {0167-8191},
   keywords = {Remote Memory Access},
   abstract = {Partitioned global address space (PGAS) languages combine the convenient abstraction of shared memory with the notion of affinity, extending multi-threaded programming to large-scale systems with physically distributed memory. However, in spite of their obvious advantages, PGAS languages still lack appropriate tool support for performance analysis, one of the reasons why their adoption is still in its infancy. Some of the performance problems for which tool support is needed occur at the level of the underlying one-sided communication substrate, such as the Aggregate Remote Memory Copy Interface (ARMCI). One such example is the waiting time in situations where asynchronous data transfers cannot be completed without software intervention at the target side. This is not uncommon on systems with reduced operating-system kernels such as IBM Blue Gene/P where the use of progress threads would double the number of cores necessary to run an application. In this paper, we present an extension of the Scalasca trace-analysis infrastructure aimed at the identification and quantification of progress-related waiting times at larger scales. We demonstrate its utility and scalability using a benchmark running with up to 32,768 processes.}
}

@INPROCEEDINGS{geimer_ea:2012:hierarchical_unify_binary_cube,
     author = {Geimer, Markus and Saviankou, Pavel and Strube, Alexandre and Szebenyi, Zolt{\'{a}}n and Wolf, Felix and Wylie, Brian J. N.},
      title = {Further improving the scalability of the {Scalasca} toolset},
  booktitle = {Proc. of PARA 2010: State of the Art in Scientific and Parallel Computing, Part II: Minisymposium Scalable tools for High Performance Computing, Reykjavik, Iceland, June 6--9 2010},
     series = {Lecture Notes in Computer Science},
     volume = {7134},
      pages = {463--474},
  publisher = {Springer},
       year = {2012},
        doi = {10.1007/978-3-642-28145-7_45},
       isbn = {978-3-642-28144-0}
}

@INPROCEEDINGS{an_mey_ea:2010:cihpc,
        author = {an Mey, Dieter and Biersdorff, Scott and Bischof, Christian and Diethelm, Kai and Eschweiler, Dominic and Gerndt, Michael and Kn{\"{u}}pfer, Andreas and Lorenz, Daniel and Malony, Allen D. and Nagel, Wolfgang E. and Oleynik, Yury and R{\"{o}}ssel, Christian and Saviankou, Pavel and Schmidl, Dirk and Shende, Sameer S. and Wagner, Michael and Wesarg, Bert and Wolf, Felix},
         title = {{Score-P}: {A} Unified Performance Measurement System for Petascale Applications},
     booktitle = {Proc. of the CiHPC: Competence in High Performance Computing, HPC Status Konferenz der Gau{\ss}-Allianz e.V., Schwetzingen, Germany, June 2010},
         pages = {85--97},
  organization = {Gau{\ss}-Allianz},
     publisher = {Springer},
          year = {2012},
           doi = {10.1007/978-3-642-24025-6_8},
          isbn = {978-3-642-24025-6}
}

@INPROCEEDINGS{Eschweiler_ea:2012:otf2_format_libraries,
     author = {Eschweiler, Dominic and Wagner, Michael and Geimer, Markus and Kn{\"{u}}pfer, Andreas and Nagel, Wolfgang E. and Wolf, Felix},
      title = {{Open Trace Format 2} -- {The} Next Generation of Scalable Trace Formats and Support Libraries},
  booktitle = {Proc. of the Intl. Conference on Parallel Computing (ParCo), Ghent, Belgium, August 30 -- September 2 2011},
     series = {Advances in Parallel Computing},
     volume = {22},
       year = {2012},
      pages = {481--490},
  publisher = {IOS Press},
       isbn = {978-1-61499-040-6},
        doi = {10.3233/978-1-61499-041-3-481}
}

@INPROCEEDINGS{andersson_wylie:para2010:GemsFDTD,
     author = {Andersson, Ulf and Wylie, Brian J. N.},
      title = {Performance engineering of {GemsFDTD} computational electromagnetics solver},
  booktitle = {Proc. of PARA 2010: State of the Art in Scientific and Parallel Computing, Reykjav{\'{\i}}k, Iceland, Part I},
     series = {Lecture Notes in Computer Science},
     volume = {7133},
       year = {2012},
      pages = {314--324},
  publisher = {Springer},
        doi = {10.1007/978-3-642-28151-8_31},
   abstract = {Since modern high-performance computer systems consist of many hardware components and software layers, they present severe challenges for application developers who are primarily domain scientists and not experts with continually evolving hardware and system software. Effective tools for performance analysis are therefore decisive when developing performant scalable parallel applications. Such tools must be convenient to employ in the application development process and analysis must be both clear to interpret and yet comprehensive in the level of detail provided. We describe how the Scalasca toolset was applied in engineering the GemsFDTD computational electromagnetics solver, and the dramatic performance and scalability gains thereby achieved.}
}

@INCOLLECTION{knuepfer:2011:scorep,
     author = {Kn{\"{u}}pfer, Andreas and R{\"{o}}ssel, Christian and an Mey, Dieter and Biersdorff, Scott and Diethelm, Kai and Eschweiler, Dominic and Geimer, Markus and Gerndt, Michael and Lorenz, Daniel and Malony, Allen D. and Nagel, Wolfgang E. and Oleynik, Yury and Philippen, Peter and Saviankou, Pavel and Schmidl, Dirk and Shende, Sameer S. and Tsch{\"{u}}ter, Ronny and Wagner, Michael and Wesarg, Bert and Wolf, Felix},
      title = {{Score-P} -- {A} Joint Performance Measurement Run-Time Infrastructure for {Periscope}, {Scalasca}, {TAU}, and {Vampir}},
  booktitle = {Tools for High Performance Computing 2011, Proc. of the 5th Parallel Tools Workshop, Dresden, Germany, September 2011},
      pages = {79--91},
  publisher = {Springer},
       year = {2012},
        doi = {10.1007/978-3-642-31476-6_7},
       isbn = {978-3-642-31476-6}
}

@PHDTHESIS{szebenyi:2012:dissertation,
    author = {Szebenyi, Zolt{\'{a}}n},
     title = {Capturing Parallel Performance Dynamics},
      year = {2012},
    school = {RWTH Aachen University},
   address = {volume 12 of IAS Series, Forschungszentrum J{\"u}lich},
      note = {{ISBN} 978-3-89336-798-6},
       url = {http://hdl.handle.net/2128/4603},
  abstract = {Supercomputers play a key role in countless areas of science and engineering, enabling the development of new insights and technological advances never possible before. The strategic importance and ever-growing complexity of the efficient usage of supercomputing resources makes application performance analysis invaluable for the development of parallel codes. Runtime call-path profiling is a conventional, well-known method used for collecting summary statistics of an execution such as the time spent in different call paths of the code. However, these kinds of measurements only give the user a summary overview of the entire execution, without regard to changes in performance behavior over time. The possible causes of temporal changes are quite numerous, ranging from adaptive workload balancing through periodically executed extra work or distinct computational phases to system noise. As present day scientific applications tend to be run for extended periods of time, understanding the patterns and trends in the performance data along the time axis becomes crucial.}
}

@article{roessel_ea:2012:lmac,
  author  = {R{\"{o}}ssel, Christian and Mohr, Bernd and Gerndt, Michael and Wolf, Felix},
  title   = {Performance Dynamics of Massively Parallel Codes},
  journal = {Innovatives Supercomputing in Deutschland (inSiDE)},
  volume  = {10},
  number  = {2},
  pages   = {72--73},
  year    = {2012},
  url     = {http://inside.hlrs.de/_old/htm/Edition_02_12/article_19.html},
}

@inproceedings{Boehme_ea:2012:EEFSW,
  author    = {B{\"{o}}hme, David and Hermanns, Marc-Andr{\'{e}} and Wolf, Felix},
  title     = {Scalasca},
  booktitle = {Entwicklung und Evolution von Forschungssoftware, Rolduc, November 2011},
  series    = {Aachener Informatik-Berichte, Software Engineering},
  volume    = {14},
  pages     = {43--48},
  year      = {2012},
  publisher = {Shaker},
  location  = {Aachen},
  crossref  = {EEFSW:2012},
}



Cross-referenced publications:
@proceedings{EEFSW:2012,
  editor    = {Rumpe, Bernhard and Lichter, Horst},
  title     = {Entwicklung und Evolution von Forschungssoftware},
  booktitle = {Entwicklung und Evolution von Forschungssoftware, Rolduc, November 2011},
  series    = {Aachener Informatik-Berichte, Software Engineering},
  volume    = {14},
  year      = {2012},
  publisher = {Shaker},
  isbn      = {978-3-8440-1600-0},
}

@inproceedings{Roessel_ea:2012:EEFSW,
  author    = {R{\"{o}}ssel, Christian and Mohr, Bernd and Wolf, Felix},
  title     = {Score-{P}},
  booktitle = {Entwicklung und Evolution von Forschungssoftware, Rolduc, Niederlande, November 2011},
  series    = {Aachener Informatik-Berichte, Software Engineering},
  volume    = {14},
  pages     = {23--30},
  year      = {2012},
  publisher = {Shaker},
}

@inproceedings{Lorenz_ea:2012:OpenMPProfiling,
  author    = {Lorenz, Daniel and Philippen, Peter and Schmidl, Dirk and Wolf, Felix},
  title     = {Profiling of {OpenMP} tasks with {Score-P}},
  booktitle = {Proc. of the 41st International Conference on Parallel Processing Workshops (ICPPW), Workshop on Parallel Software Tools and Tool Infrastructures (PSTI)},
  pages     = {444--453},
  month     = sep,
  year      = {2012},
  location  = {Pittsburgh, PA, USA},
  issn      = {0190-3918},
  isbn      = {978-1-4673-2509-7},
  doi       = {10.1109/ICPPW.2012.62},
}

@ARTICLE{Hermanns_ea:2012:Mpi2RmaAnalysis,
    author = {Hermanns, Marc-Andr{\'{e}} and Geimer, Markus and Mohr, Bernd and Wolf, Felix},
     month = aug,
     title = {Scalable detection of {MPI}-2 remote memory access inefficiency patterns},
   journal = {Intl. Journal of High Performance Computing Applications (IJHPCA)},
    volume = {26},
    number = {3},
      year = {2012},
     pages = {227--236},
 publisher = {Sage},
       doi = {10.1177/1094342011406758},
       url = {http://hpc.sagepub.com/content/early/2011/06/03/1094342011406758.full.pdf+html},
  abstract = {Wait states in parallel applications can be identified by scanning event traces for characteristic patterns. In our earlier work we defined such inefficiency patterns for MPI-2 one-sided communication, although still based on a serial trace-analysis scheme with limited scalability. In this article we show how wait states in one-sided communications can be detected in a more scalable fashion by taking advantage of a new scalable trace-analysis approach based on a parallel replay, which was originally developed for MPI-1 point-to-point and collective communication. Moreover, we demonstrate the scalability of our method and its usefulness for the optimization cycle with applications running on up to 32,768 cores.}
}

@INPROCEEDINGS{Calotoiu_ea:2012:CollectivesDetection,
     author = {Calotoiu, Alexandru and Siebert, Christian and Wolf, Felix},
   keywords = {collective operations, HPL, MPI, performance optimization},
      month = aug,
      title = {Pattern-Independent Detection of Manual Collectives in {MPI} Programs},
  booktitle = {Proc. of the 18th Euro-Par Conference, Rhodes Island, Greece},
     series = {Lecture Notes in Computer Science},
     volume = {7484},
       year = {2012},
      pages = {28--39},
  publisher = {Springer},
   location = {Berlin / Heidelberg},
       issn = {0302-9743},
       isbn = {978-3-642-32819-0},
        doi = {10.1007/978-3-642-32820-6_5},
   abstract = {In parallel applications, a significant amount of communication
occurs in a collective fashion to perform, for example, broadcasts,
reductions, or complete exchanges. Although the MPI standard defines
many convenience functions for this purpose, which not only improve
code readability and maintenance but are usually also highly efficient,
many application programmers still create their own, manual implementations
using point-to-point communication. We show how instances of
such hand-crafted collectives can be automatically detected. Matching
pre- and post-conditions of hashed message exchanges recorded in event
traces, our method is independent of the specific communication pattern
employed. We demonstrate that replacing detected broadcasts in
the HPL benchmark can yield significant performance improvements.}
}

@inproceedings{schmidl_ea:2012:OpenMP_Task_Analysis,
  author    = {Schmidl, Dirk and Philippen, Peter and Lorenz, Daniel and R{\"{o}}ssel, Christian and Geimer, Markus and an Mey, Dieter and Mohr, Bernd and Wolf, Felix},
  title     = {Performance Analysis Techniques for Task-Based {OpenMP} Applications},
  booktitle = {Proc. of the 8th International Workshop on OpenMP (IWOMP), Rome, Italy},
  series    = {Lecture Notes in Computer Science},
  volume    = {7312},
  pages     = {196--209},
  month     = jun,
  year      = {2012},
  publisher = {Springer},
  address   = {Berlin / Heidelberg},
  isbn      = {978-3-642-30960-1},
  doi       = {10.1007/978-3-642-30961-8_15},
}

@inproceedings{dbo:2012:criticalpath,
  author    = {B{\"{o}}hme, David and de Supinski, Bronis R. and Geimer, Markus and Schulz, Martin and Wolf, Felix},
  title     = {Scalable Critical-Path Based Performance Analysis},
  booktitle = {Proc. of the 26th IEEE International Parallel and Distributed Processing Symposium (IPDPS), Shanghai, China},
  pages     = {1330--1340},
  month     = may,
  year      = {2012},
  publisher = {IEEE},
  issn      = {1530-2075},
  doi       = {10.1109/IPDPS.2012.120},
  abstract  = {The critical path, which describes the longest execution sequence
without wait states in a parallel program, identifies the activities
that determine the overall program runtime. Combining knowledge of the
critical path with traditional parallel profiles, we have defined a
set of compact performance indicators that help answer a variety of important
performance-analysis questions, such as identifying load imbalance,
quantifying the impact of imbalance on runtime, and characterizing
resource consumption. By replaying event traces in parallel, we can
calculate these performance indicators in a highly
scalable way, making them a suitable analysis instrument for massively
parallel programs with thousands of processes. Case studies with
real-world parallel applications confirm that---in comparison to
traditional profiles---our indicators provide enhanced insight into
program behavior, especially when evaluating partitioning schemes of
MPMD programs.},
}

@INPROCEEDINGS{dbo:2012:phdforum,
     author = {B{\"{o}}hme, David and Geimer, Markus and Wolf, Felix},
      month = may,
      title = {Characterizing Load and Communication Imbalance in Large-Scale Parallel Applications},
  booktitle = {Proc. of the 26th IEEE International Parallel and Distributed Processing Symposium Workshops and PhD Forum (IPDPSW), Shanghai, China},
       year = {2012},
      pages = {2538--2541},
  publisher = {IEEE},
       isbn = {978-1-4673-0974-5},
        doi = {10.1109/IPDPSW.2012.321},
   abstract = {Load or communication imbalance prevents many codes from taking advantage of the parallelism available on modern supercomputers. We present two scalable methods to highlight imbalance in parallel programs: The first method identifies delays that inflict wait states at subsequent synchronization points, and attributes their costs in terms of resource waste to the original cause. The second method combines knowledge of the critical path with traditional parallel profiles to derive a set of compact performance indicators that help answer a variety of important performance-analysis questions, such as identifying load imbalance, quantifying the impact of imbalance on runtime, and characterizing resource consumption. Both methods employ a highly scalable parallel replay of event traces, making them a suitable analysis instrument for massively parallel MPI programs with tens of thousands of processes.},
date-added={2012-03-15 18:05:43 +0100},
date-modified={2012-03-15 18:18:46 +0100},
}

@ARTICLE{wolf:2011:inside,
     author = {Wolf, Felix},
      title = {Understanding the Formation of Wait States in Parallel Programs},
    journal = {Innovatives Supercomputing in Deutschland (inSiDE)},
     volume = {9},
     number = {1},
       year = {2011},
      pages = {94--95},
  publisher = {Gauss Centre for Supercomputing},
        url = {http://inside.hlrs.de/_old/htm/Edition_01_11/article_23.html}
}

@INCOLLECTION{Wolf:2011:Scalasca,
     author = {Wolf, Felix},
      month = oct,
      title = {Scalasca},
  booktitle = {Encyclopedia of Parallel Computing},
       year = {2011},
      pages = {1775--1785},
  publisher = {Springer},
       isbn = {978-0-387-09765-7},
        doi = {10.1007/978-0-387-09766-4_61},
        url = {https://link.springer.com/referenceworkentry/10.1007%2F978-0-387-09766-4_61},
   crossref = {Encyclopedia:2011}
}



Cross-referenced publications:
@book{Encyclopedia:2011,
  editor    = {Padua, David},
  title     = {Encyclopedia of Parallel Computing},
  booktitle = {Encyclopedia of Parallel Computing},
  edition   = {1},
  month     = oct,
  year      = {2011},
  publisher = {Springer},
}

@inproceedings{mussler_ea:2011:cobi,
  author    = {Mu{\ss}ler, Jan and Lorenz, Daniel and Wolf, Felix},
  title     = {Reducing the overhead of direct application instrumentation using prior static analysis},
  booktitle = {Proc. of the 17th Euro-Par Conference, Bordeaux, France},
  series    = {Lecture Notes in Computer Science},
  volume    = {6852},
  pages     = {65--76},
  month     = sep,
  year      = {2011},
  publisher = {Springer},
  doi       = {10.1007/978-3-642-23400-2_7},
  abstract  = {Preparing performance measurements of HPC applications is usually a
  tradeoff between accuracy and granularity of the measured data. When
  using direct instrumentation, that is, the insertion of extra code
  around performance-relevant functions, the measurement overhead
  increases with the rate at which these functions are visited. If
  applied indiscriminately, the measurement dilation can even be
  prohibitive. In this paper, we show how static code analysis in
  combination with binary re-writing can help eliminate unnecessary
  instrumentation points based on configurable filter rules. In
  contrast to earlier approaches, our technique does not rely on
  dynamic information, making extra runs prior to the actual
  measurement dispensable. Moreover, the rules can be applied and
  modified without re-compilation. We evaluate filter rules designed
  for the analysis of computation and communication performance and
  show that in most cases the measurement dilation can be reduced to a
  few percent while still retaining significant detail.},
}

@INPROCEEDINGS{geimer_ea:2011:eurompi,
     author = {Geimer, Markus and Hermanns, Marc-Andr{\'{e}} and Siebert, Christian and Wolf, Felix and Wylie, Brian J. N.},
      month = sep,
      title = {Scaling Performance Tool {MPI} Communicator Management},
  booktitle = {Proc. of the 18th European MPI Users' Group Meeting (EuroMPI), Santorini, Greece},
     series = {Lecture Notes in Computer Science},
     volume = {6960},
       year = {2011},
      pages = {178--187},
  publisher = {Springer},
       isbn = {978-3-642-24448-3},
        doi = {10.1007/978-3-642-24449-0_21}
}

@inproceedings{hermanns_ea:2011:OneSidedReplay,
  author    = {Hermanns, Marc-Andr{\'{e}} and Krishnamoorthy, Sriram and Wolf, Felix},
  title     = {A Scalable Replay-based Infrastructure for the Performance Analysis of One-sided Communication},
  booktitle = {Proc. of the 1st Intl. Workshop on High-performance Infrastructure for Scalable Tools (WHIST), held in conjunction with the International Conference on Supercomputing (ICS), Tucson, AZ, USA},
  month     = jun,
  year      = {2011},
  keywords  = {event tracing, one-sided communication, Performance Analysis, Remote Memory Access},
}

@INPROCEEDINGS{szebenyi11_ea:2011:hybrid_sampling,
     author = {Szebenyi, Zolt{\'{a}}n and Gamblin, Todd and Schulz, Martin and de Supinski, Bronis R. and Wolf, Felix and Wylie, Brian J. N.},
      month = may,
      title = {Reconciling Sampling and Direct Instrumentation for Unintrusive Call-Path Profiling of {MPI} Programs},
  booktitle = {Proc. of the 25th IEEE International Parallel and Distributed Processing Symposium (IPDPS), Anchorage, AK, USA},
       year = {2011},
      pages = {640--648},
  publisher = {IEEE},
       isbn = {978-0-7695-4385-7},
        doi = {10.1109/IPDPS.2011.67},
   abstract = {We can profile the performance behavior of parallel
programs at the level of individual call paths through sampling or
direct instrumentation. While we can easily control measurement
dilation by adjusting the sampling frequency, the statistical
nature of sampling and the difficulty of accessing the parameters
of sampled events make it unsuitable for obtaining certain
communication metrics, such as the size of message payloads.
Alternatively, direct instrumentation, which is preferable for
capturing message-passing events, can excessively dilate measurements,
particularly for C++ programs, which often have many
short but frequently called class member functions. Thus, we
combine these techniques in a unified framework that exploits
the strengths of each approach while avoiding their weaknesses:
We use direct instrumentation to intercept MPI routines while we
record the execution of the remaining code through low-overhead
sampling. One of the main technical hurdles mastered was the
inexpensive and portable determination of call-path information
during the invocation of MPI routines. We show that the overhead
of our implementation is sufficiently low to support substantial
performance improvement of a C++ fluid-dynamics code.}
}

@INPROCEEDINGS{wylie:2011:pflotran,
     author = {Wylie, Brian J. N. and Geimer, Markus},
      month = may,
      title = {Large-scale performance analysis of {PFLOTRAN} with {Scalasca}},
  booktitle = {Proc. of the 53rd Cray User Group meeting, Fairbanks, AK, USA},
       year = {2011},
  publisher = {Cray User Group Inc.},
        url = {https://cug.org/5-publications/proceedings_attendee_lists/CUG11CD/pages/1-program/final_program/Thursday/16B-Wylie-Paper.pdf},
   abstract = {The PFLOTRAN code for multiphase subsurface flow and reactive transport has featured prominently in US Department of Energy SciDAC and INCITE programmes, where it has been used to simulate migration of radionuclide contaminants in groundwater. As part of its ongoing development, execution performance with up to 128k processor cores on Cray XT and IBM BG/P systems has been investigated, and a variety of aspects have been identified to inhibit PFLOTRAN performance at larger scales using the open-source Scalasca toolset. Scalability of Scalasca measurements and analyses themselves, previously demonstrated with a range of applications and benchmarks, required re-engineering in key areas to handle the complexities of PFLOTRAN executions employing MPI within PETSc, LAPACK, BLAS and HDF5 libraries at large scale.}
}

@inproceedings{szebenyi-ea:2011:perf_analysis,
  author    = {Szebenyi, Zolt{\'{a}}n and Wolf, Felix and Wylie, Brian J. N.},
  title     = {Performance Analysis of Long-running Applications},
  booktitle = {Proc. of the 25th IEEE International Parallel and Distributed Processing Symposium (IPDPS) PhD Forum, Anchorage, AK, USA},
  pages     = {2100--2103},
  month     = may,
  year      = {2011},
  publisher = {IEEE},
  isbn      = {978-0-7695-4385-7},
  doi       = {10.1109/IPDPS.2011.388},
  abstract  = {With the growing complexity of supercomputing
applications and systems, it is important to constantly develop
existing performance measurement and analysis tools to provide
new insights into application performance characteristics and
thereby help scientists and engineers utilize computing resources
more efficiently. We present the various new techniques developed,
implemented and integrated into the Scalasca toolset
specifically to enhance performance analysis of long-running
applications. The first is a hybrid measurement system seamlessly
integrating sampled and event-based measurements capable
of low-overhead, highly detailed measurements and therefore
particularly convenient for initial performance analyses. Then
we apply iteration profiling to scientific codes, and present an
algorithm for reducing the memory and space requirements of
the collected data using iteration profile clustering. Finally, we
evaluate the complete integration of all these techniques in a
unified measurement system.},
}

@incollection{geimer_ea:2010:recentdevelopments,
  author    = {Geimer, Markus and Wolf, Felix and Wylie, Brian J. N. and Becker, Daniel and B{\"{o}}hme, David and Frings, Wolfgang and Hermanns, Marc-Andr{\'{e}} and Mohr, Bernd and Szebenyi, Zolt{\'{a}}n},
  editor    = {M{\"{u}}ller, Matthias S. and Resch, Michael M. and Nagel, Wolfgang E. and Schulz, Alexander},
  title     = {Recent Developments in the {Scalasca} Toolset},
  booktitle = {Tools for High Performance Computing 2009, Proc. of the 3rd Parallel Tools Workshop, Dresden, Germany, September 2009},
  chapter   = {4},
  pages     = {39--51},
  year      = {2010},
  publisher = {Springer},
  isbn      = {978-3-642-11260-7},
  doi       = {10.1007/978-3-642-11261-4_4},
}

@incollection{wylie_ea:2010:imscalascatoolsupport,
  author    = {Wylie, Brian J. N.},
  editor    = {Monfardini, Silvia},
  title     = {Improved {Scalasca} toolset support for performance analysis of {Cray XT} systems},
  booktitle = {HPC-Europa2: Science and Supercomputing in Europe - Research Highlights 2009},
  pages     = {67},
  year      = {2010},
  publisher = {CINECA Consorzio Interuniversitario},
  address   = {Casalecchio di Reno (Bologna), Italy},
  isbn      = {978-88-86037-23-5},
  url       = {http://www.hpc-europa.eu/files/SSCinEurope/CD2009/contents/064_info-Wylie.pdf},
}

@ARTICLE{mohr_ea:2010:permeasanalysis,
    author = {Mohr, Bernd and Wylie, Brian J. N. and Wolf, Felix},
     title = {Performance measurement and analysis tools for extremely scalable systems},
   journal = {Concurrency and Computation: Practice and Experience},
    volume = {22},
    number = {16},
      year = {2010},
     pages = {2212--2229},
 publisher = {Wiley},
      note = {(ISC 2008 Award)},
       doi = {10.1002/cpe.1585},
  abstract = {High-performance computing systems continue to employ more and more processor cores. Current typical high-end machines in industry, university, and government research laboratory computing centers feature thousands of computing cores. While these machines promise ever more compute power and memory capacity to tackle today's complex simulation problems, they force application developers to greatly enhance the scalability of their codes to be able to exploit it. To better support them in their porting and tuning process, many parallel-tools research groups have already started to work on scaling their methods, techniques, and tools to extreme processor counts. In this paper, we survey existing profiling and tracing tools, report on our experience in using them in extreme scaling environments, review working and promising new methods and techniques, and discuss strategies for solving open issues and problems.}
}

@PHDTHESIS{bercker:2010:dissertation,
     author = {Becker, Daniel},
      title = {Timestamp Synchronization of Concurrent Events},
     volume = {4},
       year = {2010},
  publisher = {Verlag Forschungszentrum J{\"{u}}lich GmbH Zentralbibliothek},
     school = {RWTH Aachen University},
    address = {volume 4 of IAS Series, Forschungszentrum J{\"{u}}lich},
       note = {{ISBN} 978-3-89336-625-5},
        url = {http://hdl.handle.net/2128/3787}
}

@COMMENT{NOTE(review): in the entry below the title field repeats the name of
  the collection rather than naming the contribution, and it says 2009 where
  the booktitle field says 2010. Presumably the real chapter title was lost
  during export -- TODO confirm both fields against the printed volume
  (ISBN 978-88-86037-24-2) before citing this entry.}
@INCOLLECTION{hermanns:2010:imonesidedsupport,
     author = {Hermanns, Marc-Andr{\'{e}}},
     editor = {Monfardini, Silvia},
      title = {HPC-Europa2: Science and Supercomputing in Europe research highlights 2009},
  booktitle = {HPC-Europa2: Science and Supercomputing in Europe research highlights 2010},
       year = {2010},
      pages = {101},
  publisher = {CINECA Consorzio Interuniversitario},
    address = {Casalecchio di Reno (Bologna), Italy},
       isbn = {978-88-86037-24-2}
}

@ARTICLE{Wylie_ea:2010:LargeScaleSweep3D,
    author = {Wylie, Brian J. N. and Geimer, Markus and Mohr, Bernd and B{\"{o}}hme, David and Szebenyi, Zolt{\'{a}}n and Wolf, Felix},
  keywords = {parallel performance measurement \& analysis, scalability},
     month = dec,
     title = {Large-scale performance analysis of {Sweep3D} with the {Scalasca} toolset},
   journal = {Parallel Processing Letters},
    volume = {20},
    number = {4},
      year = {2010},
     pages = {397--414},
 publisher = {World Scientific},
       doi = {10.1142/S0129626410000314},
  abstract = {Cray XT and IBM Blue Gene systems present current alternative approaches to constructing leadership computer systems relying on applications being able to exploit very large configurations of processor cores, and associated analysis tools must also scale commensurately to isolate and quantify performance issues that manifest at the largest scales. In studying the scalability of the Scalasca performance analysis toolset to several hundred thousand MPI processes on XT5 and BG/P systems, we investigated a progressive execution performance deterioration of the well-known ASCI Sweep3D compact application. Scalasca runtime summarization analysis quantified MPI communication time that correlated with computational imbalance, and automated trace analysis confirmed growing amounts of MPI waiting times. Further instrumentation, measurement and analyses pinpointed a conditional section of highly imbalanced computation which amplified waiting times inherent in the associated wavefront communication that seriously degraded overall execution efficiency at very large scales. By employing effective data collation, management and graphical presentation, in a portable and straightforward to use toolset, Scalasca was thereby able to demonstrate performance measurements and analyses with 294,912 processes.}
}

@INPROCEEDINGS{Boehme_ea:2010:RootCauseAnalysis,
     author = {B{\"{o}}hme, David and Geimer, Markus and Wolf, Felix and Arnold, Lukas},
      month = sep,
      title = {Identifying the root causes of wait states in large-scale parallel applications},
  booktitle = {Proc. of the 39th International Conference on Parallel Processing (ICPP), San Diego, CA, USA},
       year = {2010},
      pages = {90--100},
  publisher = {IEEE},
       note = {Best Paper Award},
       issn = {0190-3918},
       isbn = {978-1-4244-7913-9},
        doi = {10.1109/ICPP.2010.18}
}

@inproceedings{becker:2010:hybrid_clc,
  author    = {Becker, Daniel and Geimer, Markus and Rabenseifner, Rolf and Wolf, Felix},
  title     = {Synchronizing the Timestamps of Concurrent Events in Traces of Hybrid {MPI/OpenMP} Applications},
  booktitle = {Proc. of IEEE International Conference on Cluster Computing (CLUSTER), Heraklion, Greece},
  pages     = {38--47},
  month     = sep,
  year      = {2010},
  publisher = {IEEE},
  isbn      = {978-0-7695-4220-1},
  doi       = {10.1109/CLUSTER.2010.13},
}

@inproceedings{lorenz_ea:2010:InstrumentTasks,
  author    = {Lorenz, Daniel and Mohr, Bernd and R{\"{o}}ssel, Christian and Schmidl, Dirk and Wolf, Felix},
  title     = {How to reconcile event-based performance analysis with tasking in {OpenMP}},
  booktitle = {Proc. of 6th Int. Workshop of OpenMP (IWOMP), Tsukuba, Japan},
  series    = {Lecture Notes in Computer Science},
  volume    = {6132},
  pages     = {109--121},
  month     = jun,
  year      = {2010},
  publisher = {Springer},
  isbn      = {978-3-642-13216-2},
  doi       = {10.1007/978-3-642-13217-9_9},
  abstract  = {With version 3.0, the OpenMP specification introduced a task construct and
  with it an additional dimension of concurrency. While offering a convenient
  means to express task parallelism, the new construct presents a serious
  challenge to event-based performance analysis. Since tasking may disrupt the
  classic sequence of region entry and exit events, essential analysis
  procedures such as reconstructing dynamic call paths or correctly
  attributing performance metrics to individual task region instances may
  become impossible. To overcome this limitation, we describe a portable
  method to distinguish individual task instances and to track their
  suspension and resumption with event-based instrumentation.  Implemented as
  an extension of the OPARI source-code instrumenter, our portable solution
  supports C/C++ programs with tied tasks and with untied tasks that are
  suspended only at implied scheduling points, while introducing only
  negligible measurement overhead. Finally, we discuss possible extensions of
  the OpenMP specification to provide general support for task identifiers
  with untied tasks.},
}

@INPROCEEDINGS{wylie_ea:2010:scalableper,
     author = {Wylie, Brian J. N. and B{\"{o}}hme, David and Frings, Wolfgang and Geimer, Markus and Mohr, Bernd and Szebenyi, Zolt{\'{a}}n and Becker, Daniel and Hermanns, Marc-Andr{\'{e}} and Wolf, Felix},
      month = may,
      title = {Scalable performance analysis of large-scale parallel applications on {Cray} {XT} systems with {Scalasca}},
  booktitle = {Proc. 52nd Cray User Group Meeting, Edinburgh, Scotland},
       year = {2010},
  publisher = {Cray User Group Incorporated},
        url = {https://cug.org/5-publications/proceedings_attendee_lists/CUG10CD/pages/1-program/final_program/CUG10_Proceedings/pages/authors/06-10Tuesday/9B-Wylie-paper.pdf},
   abstract = {The open-source Scalasca toolset [www.scalasca.org] supports integrated runtime summarization and automated trace analysis on a diverse range of HPC computer systems.  An HPC-Europa2 visit to EPCC in 2009 resulted in significantly enhanced support for Cray XT systems, particularly the auxiliary programming environments and hybrid OpenMP/MPI.  Combined with its previously demonstrated extreme scalability and portable performance analyses comparison capabilities, Scalasca has been used to analyse and tune numerous key applications (and benchmarks) on Cray XT and other PRACE prototype systems, from which experience with a representative selection is reviewed.}
}

@article{geimer_ea:2010:scalascaarchitecture,
  author    = {Geimer, Markus and Wolf, Felix and Wylie, Brian J. N. and {\'{A}}brah{\'{a}}m, Erika and Becker, Daniel and Mohr, Bernd},
  title     = {The {Scalasca} performance toolset architecture},
  journal   = {Concurrency and Computation: Practice and Experience},
  volume    = {22},
  number    = {6},
  pages     = {702--719},
  month     = apr,
  year      = {2010},
  publisher = {Wiley},
  doi       = {10.1002/cpe.1556},
}

@inproceedings{wylie_ea:2010:peranalysissweep3D,
  author    = {Wylie, Brian J. N. and B{\"{o}}hme, David and Mohr, Bernd and Szebenyi, Zolt{\'{a}}n and Wolf, Felix},
  title     = {Performance analysis of {Sweep3D} on {Blue Gene/P} with the {Scalasca} toolset},
  booktitle = {Proc. 24th International Parallel and Distributed Processing Symposium and Workshops (IPDPS), Atlanta, GA, USA},
  month     = apr,
  year      = {2010},
  publisher = {IEEE},
  isbn      = {978-1-4244-6532-3},
  doi       = {10.1109/IPDPSW.2010.5470816},
  abstract  = {In studying the scalability of the Scalasca performance analysis toolset to several hundred thousand MPI processes on IBM Blue\,Gene/P, we investigated a progressive execution performance deterioration of the well-known ASCI Sweep3D compact application.  Scalasca runtime summarization analysis quantified MPI communication time that correlated with computational imbalance, and automated trace analysis confirmed growing amounts of MPI waiting times.  Further instrumentation, measurement and analyses pinpointed a conditional section of highly imbalanced computation which amplified waiting times inherent in the associated wavefront communication that seriously degraded overall execution efficiency at very large scales.  By employing effective data collation, management and graphical presentation, Scalasca was thereby able to demonstrate performance measurements and analyses with 294,912 processes for the first time.},
}

@INPROCEEDINGS{boehme_ea:2010:nonblock_simulator,
     author = {B{\"{o}}hme, David and Hermanns, Marc-Andr{\'{e}} and Geimer, Markus and Wolf, Felix},
      month = mar,
      title = {Performance Simulation of Non-blocking Communication in Message-Passing Applications},
  booktitle = {Proc. of the 2nd Workshop on Productivity and Performance (PROPER) in conjunction with Euro-Par 2009, Delft, The Netherlands},
     series = {Lecture Notes in Computer Science},
     volume = {6043},
       year = {2010},
      pages = {208--217},
  publisher = {Springer},
       issn = {0302-9743},
        doi = {10.1007/978-3-642-14122-5_25},
   abstract = {In our previous work, we introduced performance simulation as an instrument to verify hypotheses on causality between locally and spatially distant performance phenomena without altering the application itself. This is accomplished by modifying MPI event traces and using them to simulate hypothetical message-passing behavior. Here, we present enhancements to our approach, which was previously restricted to blocking communication, that now allow us to correctly simulate MPI non-blocking communication. We enhanced the underlying trace data format to record communication requests, and extended the simulator to even retain the inherently non-deterministic behavior of operations such as MPI\_Waitany.}
}

@inproceedings{Wolf_ea:2010:nic_symposium,
  author       = {Wolf, Felix and B{\"{o}}hme, David and Geimer, Markus and Hermanns, Marc-Andr{\'{e}} and Mohr, Bernd and Szebenyi, Zolt{\'{a}}n and Wylie, Brian J. N.},
  editor       = {M{\"{u}}nster, Gernot and Wolf, Dietrich and Kremer, Manfred},
  title        = {Performance Tuning in the Petascale Era},
  booktitle    = {Proc. of the John von Neumann Institute for Computing (NIC) Symposium 2010, Juelich, Germany},
  series       = {IAS Series},
  volume       = {3},
  pages        = {339--346},
  month        = feb,
  year         = {2010},
  publisher    = {John von Neumann-Institut for Computing},
  organization = {Forschungszentrum J{\"{u}}lich},
  isbn         = {978-3-89336-606-4},
}

@inproceedings{szebenyi_ea:2008:pepc,
  author    = {Szebenyi, Zolt{\'{a}}n and Wylie, Brian J. N. and Wolf, Felix},
  title     = {Scalasca Parallel Performance Analyses of {PEPC}},
  booktitle = {Proc. of the 1st Workshop on Productivity and Performance (PROPER) in conjunction with Euro-Par 2008, Las Palmas de Gran Canaria, Spain},
  series    = {Lecture Notes in Computer Science},
  volume    = {5415},
  pages     = {305--314},
  year      = {2009},
  publisher = {Springer},
  issn      = {0302-9743},
  doi       = {10.1007/978-3-642-00955-6_35},
}

@article{wolf:2009:toolsforpetascalesystems,
  author  = {Wolf, Felix},
  title   = {Performance Tools for Petascale Systems},
  journal = {Innovatives Supercomputing in Deutschland (inSiDE)},
  volume  = {7},
  number  = {2},
  pages   = {38--39},
  year    = {2009},
  url     = {http://inside.hlrs.de/_old/htm/Edition_02_09/article_10.html},
}

@ARTICLE{becker_ea:2009:timestampsynchronization,
     author = {Becker, Daniel and Rabenseifner, Rolf and Wolf, Felix and Linford, John},
      month = dec,
      title = {Scalable timestamp synchronization for event traces of message-passing applications},
    journal = {Parallel Computing},
     volume = {35},
     number = {12},
       year = {2009},
      pages = {595--607},
  publisher = {Elsevier},
        doi = {10.1016/j.parco.2008.12.012}
}

@INPROCEEDINGS{szebenyi_ea:2009:timeseries,
     author = {Szebenyi, Zolt{\'{a}}n and Wolf, Felix and Wylie, Brian J. N.},
      month = nov,
      title = {Space-Efficient Time-Series Call-Path Profiling of Parallel Applications},
  booktitle = {Proc. of the ACM/IEEE Conference on Supercomputing (SC09), Portland, OR, USA},
       year = {2009},
  publisher = {ACM},
       isbn = {978-1-60558-744-8},
        doi = {10.1145/1654059.1654097}
}

@INPROCEEDINGS{frings_ea:2009:parallelio,
     author = {Frings, Wolfgang and Wolf, Felix and Petkov, Ventsislav},
      month = nov,
      title = {Scalable Massively Parallel {I/O} to Task-Local Files},
  booktitle = {Proc. of the ACM/IEEE Conference on Supercomputing (SC09), Portland, OR, USA},
       year = {2009},
  publisher = {ACM},
       isbn = {978-1-60558-744-8},
        doi = {10.1145/1654059.1654077}
}

@INPROCEEDINGS{hermanns_ea:2009:rmadetection,
     author = {Hermanns, Marc-Andr{\'{e}} and Geimer, Markus and Mohr, Bernd and Wolf, Felix},
      month = sep # "--" # oct,
      title = {Scalable Detection of {MPI}-2 Remote Memory Access Inefficiency Patterns},
  booktitle = {Proc. of the 16th European PVM/MPI Users' Group Meeting (EuroPVM/MPI), Espoo, Finland},
     series = {Lecture Notes in Computer Science},
     volume = {5759},
       year = {2009},
      pages = {31--41},
  publisher = {Springer},
       issn = {0302-9743},
       isbn = {978-3-642-03769-6},
        doi = {10.1007/978-3-642-03770-2_10}
}

@article{geimer_ea:2009:diagnosingwaitstates,
  author  = {Geimer, Markus and Wolf, Felix and Wylie, Brian J. N. and Mohr, Bernd},
  title   = {A scalable tool architecture for diagnosing wait states in massively parallel applications},
  journal = {Parallel Computing},
  volume  = {35},
  number  = {7},
  pages   = {375--388},
  month   = jul,
  year    = {2009},
  issn    = {0167-8191},
  doi     = {10.1016/j.parco.2009.02.003}
}

@inproceedings{geimer_ea:2009:instrumentor,
  author    = {Geimer, Markus and Shende, Sameer S. and Malony, Allen D. and Wolf, Felix},
  editor    = {Allen, Gabrielle and Nabrzyski, Jarek and Seidel, Ed and van Albada, Geert Dick and Dongarra, Jack and Sloot, Peter M. A.},
  title     = {A Generic and Configurable Source-Code Instrumentation Component},
  booktitle = {Proc. of the International Conference on Computational Science (ICCS), Baton Rouge, LA, USA},
  series    = {Lecture Notes in Computer Science},
  volume    = {5545},
  pages     = {696--705},
  publisher = {Springer},
  month     = may,
  year      = {2009},
  isbn      = {978-3-642-01972-2},
  doi       = {10.1007/978-3-642-01973-9_78}
}

@mastersthesis{hermanns:2009:silas,
  author = {Hermanns, Marc-Andr{\'{e}}},
  title  = {Trace-based performance simulation of large-scale applications},
  school = {University of Hagen},
  month  = may,
  year   = {2009},
  url    = {http://www.fz-juelich.de/jsc/docs/autoren2009/hermanns2}
}

@article{becker_ea:2009:replaybasedsynchronization,
  author  = {Becker, Daniel and Rabenseifner, Rolf and Wolf, Felix and Linford, John},
  title   = {Replay-based synchronization of timestamps in event traces of massively parallel applications},
  journal = {Scalable Computing: Practice and Experience},
  volume  = {10},
  number  = {1},
  pages   = {49--60},
  month   = mar,
  year    = {2009},
  issn    = {1895-1767},
  url     = {https://www.scpe.org/index.php/scpe/article/view/600}
}

@INPROCEEDINGS{hermanns_ea:2009:verification,
     author = {Hermanns, Marc-Andr{\'{e}} and Geimer, Markus and Wolf, Felix and Wylie, Brian J. N.},
      month = feb,
      title = {Verifying Causality Between Distant Performance Phenomena in Large-Scale {MPI} Applications},
  booktitle = {Proc. of the 17th Euromicro International Conference on Parallel, Distributed, and Network-Based Processing (PDP), Weimar, Germany},
       year = {2009},
      pages = {78--84},
  publisher = {IEEE},
       isbn = {978-0-7695-3544-9},
        doi = {10.1109/PDP.2009.50}
}

@ARTICLE{wylie_ea:2008:performancemeasurement,
    author = {Wylie, Brian J. N. and Geimer, Markus and Wolf, Felix},
     title = {Performance measurement and analysis of large-scale parallel applications on leadership computing systems},
   journal = {Scientific Programming},
    volume = {16},
    number = {2--3},
      year = {2008},
     pages = {167--181},
      issn = {1058-9244},
       url = {https://www.hindawi.com/journals/sp/2008/632685/abs/},
       doi = {10.3233/SPR-2008-0255}
}

@INCOLLECTION{wolf_ea:2008:scalascausage,
     author = {Wolf, Felix and Wylie, Brian J. N. and {\'{A}}brah{\'{a}}m, Erika and Becker, Daniel and Frings, Wolfgang and F{\"{u}}rlinger, Karl and Geimer, Markus and Hermanns, Marc-Andr{\'{e}} and Mohr, Bernd and Moore, Shirley and Pfeifer, Matthias and Szebenyi, Zolt{\'{a}}n},
      title = {Usage of the {SCALASCA} Toolset for Scalable Performance Analysis of Large-Scale Parallel Applications},
  booktitle = {Tools for High Performance Computing, Proc. of the 2nd Parallel Tools Workshop, Stuttgart, Germany, July 2008},
       year = {2008},
      pages = {157--167},
  publisher = {Springer},
       isbn = {978-3-540-68561-6},
        doi = {10.1007/978-3-540-68564-7_10}
}

@INBOOK{petkov:2008:gaststudentenprogrammergebnisse,
     author = {Petkov, Ventsislav},
      month = dec,
      title = {Beitr{\"{a}}ge zum Wissenschaftlichen Rechnen -- Ergebnisse des Gaststudentenprogramms 2008 des John von Neumann-Instituts f{\"{u}}r Computing},
    chapter = {SIONlib -- Scalable I/O Library for Native Parallel Access to Binary Files},
       year = {2008},
      pages = {93--105},
  publisher = {Forschungszentrum J{\"{u}}lich, Technical Report FZJ-JSC-IB-2008-07}
}

@INPROCEEDINGS{becker_ea:2008:clockdrifts,
     author = {Becker, Daniel and Rabenseifner, Rolf and Wolf, Felix},
      month = sep,
      title = {Implications of non-constant clock drifts for the timestamps of concurrent events},
  booktitle = {Proc. of the IEEE International Conference on Cluster Computing (CLUSTER), Tsukuba, Japan},
       year = {2008},
      pages = {59--68},
  publisher = {IEEE},
       issn = {1552-5244},
       isbn = {978-1-4244-2639-3},
        doi = {10.1109/CLUSTR.2008.4663756}
}

@INPROCEEDINGS{becker_ea:2008:timestampsynchronization,
     author = {Becker, Daniel and Linford, John and Rabenseifner, Rolf and Wolf, Felix},
      month = sep,
      title = {Replay-based synchronization of timestamps in event traces of massively parallel applications},
  booktitle = {Proc. of the International Conference on Parallel Processing Workshops (ICPPW), 1st International Workshop on Simulation and Modelling in Emergent Computational Systems (SMECS), Portland, OR, USA},
       year = {2008},
      pages = {212--219},
  publisher = {IEEE},
       issn = {0190-3918},
       isbn = {978-0-7695-3375-9},
        doi = {10.1109/ICPP-W.2008.17}
}

@INPROCEEDINGS{becker_ea:2008:grid-basedworkflow,
     author = {Becker, Daniel and Riedel, Morris and Streit, Achim and Wolf, Felix},
      month = jun,
      title = {Grid-Based Workflow Management for Automatic Performance Analysis of Massively Parallel Applications},
  booktitle = {Proc. of the 3rd CoreGRID Workshop on Grid Middleware, Barcelona, Spain},
     series = {CoreGRID Series},
       year = {2008},
      pages = {103--118},
  publisher = {Springer},
       isbn = {978-0-387-85965-1},
        doi = {10.1007/978-0-387-85966-8_8}
}

@INPROCEEDINGS{szebenyi_ea:2008:spec_mpi2007,
     author = {Szebenyi, Zolt{\'{a}}n and Wylie, Brian J. N. and Wolf, Felix},
      month = jun,
      title = {{SCALASCA} Parallel Performance Analyses of {SPEC MPI2007} Applications},
  booktitle = {Proc. of the 1st SPEC International Performance Evaluation Workshop (SIPEW), Darmstadt, Germany},
     series = {Lecture Notes in Computer Science},
     volume = {5119},
       year = {2008},
      pages = {99--123},
  publisher = {Springer},
       isbn = {978-3-540-69813-5},
        doi = {10.1007/978-3-540-69814-2_8}
}

@INPROCEEDINGS{geimer_ea:2008:scalascaarchitecture,
     author = {Geimer, Markus and Wolf, Felix and Wylie, Brian J. N. and {\'{A}}brah{\'{a}}m, Erika and Becker, Daniel and Mohr, Bernd},
      month = jun,
      title = {The {SCALASCA} Performance Toolset Architecture},
  booktitle = {International Workshop on Scalable Tools for High-End Computing (STHEC), Kos, Greece},
       year = {2008},
      pages = {51--65}
}

@INPROCEEDINGS{hernandez_ea:2008:compileroptimizations,
     author = {Hernandez, Oscar and Song, Fengguang and Chapman, Barbara and Dongarra, Jack and Mohr, Bernd and Moore, Shirley and Wolf, Felix},
      month = jun,
      title = {Performance Instrumentation and Compiler Optimizations for {MPI/OpenMP} Applications},
  booktitle = {Proc. of the 2nd International Workshop on OpenMP (IWOMP 2006), Reims, France},
     series = {Lecture Notes in Computer Science},
     volume = {4315},
       year = {2008},
      pages = {267--278},
  publisher = {Springer},
       isbn = {978-3-540-68554-8},
        doi = {10.1007/978-3-540-68555-5_22}
}

@TECHREPORT{hermanns_ea:2008:causalconnections,
       author = {Hermanns, Marc-Andr{\'{e}} and Geimer, Markus and Wolf, Felix and Wylie, Brian J. N.},
        month = apr,
        title = {Verifying Causal Connections between Distant Performance Phenomena in Large-Scale Message-Passing Applications},
         type = {Technical Report},
       number = {FZJ-JSC-IB-2008-05},
         year = {2008},
  institution = {Forschungszentrum J{\"{u}}lich}
}

@INPROCEEDINGS{becker_ea:2008:optimization,
     author = {Becker, Daniel and Frings, Wolfgang and Wolf, Felix},
      month = feb,
      title = {Performance Evaluation and Optimization of Parallel Grid Computing Applications},
  booktitle = {Proc. of the 16th Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP), Toulouse, France},
       year = {2008},
      pages = {193--199},
  publisher = {IEEE},
       issn = {1066-6192},
       isbn = {978-0-7695-3089-5},
        doi = {10.1109/PDP.2008.27}
}

@INPROCEEDINGS{wolf_ea:2008:performanceanalysisfornextgeneration,
     author = {Wolf, Felix and Becker, Daniel and Geimer, Markus and Wylie, Brian J. N.},
      month = feb,
      title = {Scalable Performance Analysis Methods for the Next Generation of Supercomputers},
  booktitle = {Proc. of the John von Neumann Institute for Computing (NIC) Symposium, J{\"{u}}lich, Germany},
     series = {NIC-Series},
     volume = {39},
       year = {2008},
      pages = {315--322},
       isbn = {978-3-9810843-5-1}
}

@inproceedings{geimer_ea:2006:scalableperformanceanalysis,
  author    = {Geimer, Markus and Wolf, Felix and Kn{\"{u}}pfer, Andreas and Mohr, Bernd and Wylie, Brian J. N.},
  title     = {A Parallel Trace-Data Interface for Scalable Performance Analysis},
  booktitle = {Proc. of the 8th International Workshop on State-of-the-Art in Scientific and Parallel Computing (PARA), Ume{\aa}, Sweden, June 2006},
  series    = {Lecture Notes in Computer Science},
  volume    = {4699},
  pages     = {398--408},
  publisher = {Springer},
  year      = {2007},
  isbn      = {978-3-540-75754-2},
  doi       = {10.1007/978-3-540-75755-9_49}
}

@inproceedings{wylie_ea:2006:runtimemeasurement,
  author    = {Wylie, Brian J. N. and Wolf, Felix and Mohr, Bernd and Geimer, Markus},
  title     = {Integrated Runtime Measurement Summarisation and Selective Event Tracing for Scalable Parallel Execution Performance Diagnosis},
  booktitle = {Proc. of the 8th International Workshop on State-of-the-Art in Scientific and Parallel Computing (PARA), Ume{\aa}, Sweden, June 2006},
  series    = {Lecture Notes in Computer Science},
  volume    = {4699},
  pages     = {460--469},
  publisher = {Springer},
  year      = {2007},
  isbn      = {978-3-540-75754-2},
  doi       = {10.1007/978-3-540-75755-9_55}
}

@ARTICLE{bischof_ea:2007:produktivitaetvsperformanz,
    author = {Bischof, Christian and Wolf, Felix},
     title = {{Produktivit{\"{a}}t} versus {Performanz} in der {Simulation}},
   journal = {RWTH Themen},
    volume = {2},
      year = {2007},
     pages = {38--39}
}

@ARTICLE{behbahani_ea:2007:krankenherzenhelfen,
    author = {Behbahani, M. and Behr, Marek and Bischof, Christian and Wolf, Felix},
     title = {{Kranken} {Herzen} helfen},
   journal = {RWTH Themen},
    volume = {1},
      year = {2007},
     pages = {44--46}
}

@INPROCEEDINGS{becker_ea:2007:optimization,
        author = {Becker, Daniel and Frings, Wolfgang and Wolf, Felix},
         month = dec,
         title = {Performance Evaluation and Optimization of Metacomputing Applications},
     booktitle = {Proc. of the 3rd Workshop on Communication in Cluster- and Grid-Systems (KiCC, Kommunikation in Clusterrechnern und Clusterverbundsystemen), Aachen, Germany},
          year = {2007},
         pages = {32--39},
  organization = {RWTH Aachen University},
           url = {http://nemo.ub.rwth-aachen.de/record/115518}
}

@TECHREPORT{linford:2007:researchreport,
       author = {Linford, John},
        month = nov,
        title = {{CESRI 2007 Research Report} -- {Implementation} and Validation of the Extended Controlled Logical Clock},
       number = {FZJ-JSC-IB-2007-11},
         year = {2007},
  institution = {Forschungszentrum J{\"{u}}lich}
}

@INPROCEEDINGS{geimer_ea:2007:scalablecollation,
     author = {Geimer, Markus and Kuhlmann, Bj{\"{o}}rn and Pulatova, Farzona and Wolf, Felix and Wylie, Brian J. N.},
      month = sep,
      title = {Scalable Collation and Presentation of Call-Path Profile Data with {CUBE}},
  booktitle = {Proc. of the Conference on Parallel Computing (ParCo), Aachen/J{\"{u}}lich, Germany},
       year = {2007},
      pages = {645--652},
       note = {{\em Minisymposium Scalability and Usability of HPC Programming Tools}},
       issn = {0927-5452},
       isbn = {978-1-58603-796-3}
}

@INPROCEEDINGS{becker_ea:2006:timestampsynchronization,
     author = {Becker, Daniel and Rabenseifner, Rolf and Wolf, Felix},
      month = sep # "--" # oct,
      title = {Timestamp Synchronization for Event Traces of Large-Scale Message-Passing Applications},
  booktitle = {Proc. of the 14th European PVM/MPI Users' Group Meeting (EuroPVM/MPI), Paris, France},
     series = {Lecture Notes in Computer Science},
     volume = {4757},
       year = {2007},
      pages = {315--325},
  publisher = {Springer},
       isbn = {978-3-540-75415-2},
        doi = {10.1007/978-3-540-75416-9_43}
}

@INPROCEEDINGS{wylie_ea:2007:xns_cfd,
     author = {Wylie, Brian J. N. and Geimer, Markus and Nicolai, Mike and Probst, Markus},
      month = sep,
      title = {Performance analysis and tuning of the {XNS CFD} solver on {BlueGene/L}},
  booktitle = {Proc. of the 14th European PVM/MPI Users' Group Meeting (EuroPVM/MPI), Paris, France},
     series = {Lecture Notes in Computer Science},
     volume = {4757},
       year = {2007},
      pages = {107--116},
  publisher = {Springer},
       isbn = {978-3-540-75415-2}
}

@article{malony_ea:2007:measurementoverheadcompensation,
  author    = {Malony, Allen D. and Shende, Sameer S. and Morris, Alan and Wolf, Felix},
  title     = {Compensation of Measurement Overhead in Parallel Performance Profiling},
  journal   = {International Journal of High Performance Computing Applications},
  volume    = {21},
  number    = {2},
  pages     = {174--194},
  publisher = {SAGE Publications},
  month     = may,
  year      = {2007},
  issn      = {1094-3420},
  doi       = {10.1177/1094342007077862}
}

@INCOLLECTION{wylie:2007:transnationalaccessreport,
     author = {Wylie, Brian J. N.},
     editor = {Alberigo, Paola and Erbacci, Giovanni and Garofalo, Francesca and Monfardini, Silvia},
      month = apr,
      title = {Scalable performance analysis of large-scale parallel applications on {MareNostrum}},
  booktitle = {Science and Supercomputing in Europe},
     series = {HPC-Europa Transnational Access Report},
       year = {2007},
      pages = {453--461},
  publisher = {CINECA Consorzio Interuniversitario},
    address = {Casalecchio di Reno (Bologna), Italy},
       note = {Also available as SSCinEurope 2007 CD},
       isbn = {978-88-86037-21-1},
        url = {http://www.hpc-europa.eu/files/SSCinEurope/CD2007/contents/087-com-science-Whylie.pdf},
   abstract = {The {\sf scalasca} toolset was readily ported to Mare Nostrum and its
measurement and automated analysis scalability verified to 1024
processes with the {\sc smg2000} benchmark and {\sc wrf} numerical
weather prediction application.  It was subsequently compared with the
local Paraver/MPItrace tools, albeit with difficulty due to their quite
different modes of operation, and found to be complementary:  {\sf
scalasca} is able to rapidly generate automated analysis summaries from
the largest configurations, which can be used to direct much more
extensive Paraver trace analyses to particularly significant performance
issues.  A previously unexpected problem with exiting MPI communication
collectives was identified and codified in a new {\sf scalasca} ``{\sl N
x N Completion}'' property, allowing its significance to be investigated and
instances to be examined with Paraver.}
}

@INPROCEEDINGS{becker_ea:2007:performanceanalysis,
     author = {Becker, Daniel and Wolf, Felix and Frings, Wolfgang and Geimer, Markus and Wylie, Brian J. N. and Mohr, Bernd},
      month = mar,
      title = {Automatic Trace-Based Performance Analysis of Metacomputing Applications},
  booktitle = {Proc. of the International Parallel and Distributed Processing Symposium (IPDPS), Long Beach, CA, USA},
       year = {2007},
  publisher = {IEEE},
       issn = {1530-2075},
       isbn = {1-4244-0909-8},
        doi = {10.1109/IPDPS.2007.370238}
}

@ARTICLE{wolf_ea:2007:inefficiencypatternanalysis,
     author = {Wolf, Felix and Mohr, Bernd and Dongarra, Jack and Moore, Shirley},
      month = feb,
      title = {Automatic analysis of inefficiency patterns in parallel applications},
    journal = {Concurrency and Computation: Practice and Experience},
     volume = {19},
     number = {11},
       year = {2007},
      pages = {1481--1496},
  publisher = {Wiley},
        doi = {10.1002/cpe.1128}
}

@article{geimer_ea:2006:articleaboutperformanceanalysis,
  author  = {Geimer, Markus and Wolf, Felix and Wylie, Brian J. N. and Mohr, Bernd},
  title   = {Scalable Parallel Trace-Based Performance Analysis},
  journal = {Innovatives Supercomputing in Deutschland (inSiDE)},
  volume  = {4},
  number  = {2},
  pages   = {16--19},
  year    = {2006},
  url     = {http://inside.hlrs.de/_old/htm/Edition_02_06/article_06.htm}
}

@INPROCEEDINGS{geimer_ea:2006:performanceanalysis,
     author = {Geimer, Markus and Wolf, Felix and Wylie, Brian J. N. and Mohr, Bernd},
      month = sep,
      title = {Scalable Parallel Trace-Based Performance Analysis},
  booktitle = {Proc. of the 13th European PVM/MPI Users' Group Meeting (EuroPVM/MPI), Bonn, Germany},
     series = {Lecture Notes in Computer Science},
     volume = {4192},
       year = {2006},
      pages = {303--312},
  publisher = {Springer},
       isbn = {978-3-540-39110-4},
        doi = {10.1007/11846802_43}
}

@INPROCEEDINGS{kuehnal_ea:2006:inefficiencypatterns,
     author = {K{\"{u}}hnal, Andrej and Hermanns, Marc-Andr{\'{e}} and Mohr, Bernd and Wolf, Felix},
      month = aug # "--" # sep,
      title = {Specification of Inefficiency Patterns for {MPI}-2 One-sided Communication},
  booktitle = {Proc. of the 12th Euro-Par Conference, Dresden, Germany},
     series = {Lecture Notes in Computer Science},
     volume = {4128},
       year = {2006},
      pages = {47--62},
  publisher = {Springer},
       isbn = {978-3-540-37783-2},
        doi = {10.1007/11823285_6}
}

@INPROCEEDINGS{aguilera_ea:2006:multi-step,
     author = {Aguilera, Gaby and Teller, Patricia J. and Taufer, Michaela and Wolf, Felix},
      month = apr,
      title = {A Systematic Multi-step Methodology for Performance Analysis of Communication Traces of Distributed Applications based on Hierarchical Clustering},
  booktitle = {Proc. of the 5th International Workshop on Performance Modeling, Evaluation, and Organization of Parallel and Distributed Systems (PMEO-PDS, in conjunction with IPDPS 2006), Rhodes Island, Greece},
       year = {2006},
  publisher = {IEEE},
       issn = {1530-2075},
       isbn = {1-4244-0054-6},
        doi = {10.1109/IPDPS.2006.1639645}
}

@INPROCEEDINGS{wolf_ea:2006:largeeventtraces,
     author = {Wolf, Felix and Freitag, Felix and Mohr, Bernd and Moore, Shirley and Wylie, Brian J. N.},
      month = mar,
      title = {Large Event Traces in Parallel Performance Analysis},
  booktitle = {Proc. of the 8th Workshop on Parallel Systems and Algorithms (PASA), Frankfurt, Germany},
     series = {Lecture Notes in Informatics},
     volume = {P-81},
       year = {2006},
      pages = {264--273},
  publisher = {Gesellschaft f{\"{u}}r Informatik},
       isbn = {3-88579-175-7}
}

@INPROCEEDINGS{wolf_ea:2005:overheadcompensation,
     author = {Wolf, Felix and Malony, Allen D. and Shende, Sameer S. and Morris, Alan},
      month = sep,
      title = {Trace-Based Parallel Performance Overhead Compensation},
  booktitle = {Proc. of the International Conference on High Performance Computing and Communications (HPCC), Sorrento, Italy},
     series = {Lecture Notes in Computer Science},
     volume = {3726},
       year = {2005},
      pages = {617--628},
  publisher = {Springer},
       isbn = {978-3-540-29031-5},
        doi = {10.1007/11557654_72}
}

@INPROCEEDINGS{moore_ea:2005:approach,
     author = {Moore, Shirley and Wolf, Felix and Dongarra, Jack and Shende, Sameer S. and Malony, Allen D. and Mohr, Bernd},
      month = sep,
      title = {A Scalable Approach to {MPI} Application Performance Analysis},
  booktitle = {Proc. of the 12th European PVM/MPI Users' Group Meeting (EuroPVM/MPI), Sorrento, Italy},
     series = {Lecture Notes in Computer Science},
     volume = {3666},
       year = {2005},
      pages = {309--316},
  publisher = {Springer},
       isbn = {978-3-540-29009-4},
        doi = {10.1007/11557265_41}
}

@INPROCEEDINGS{wylie_ea:2005:hardwarecounter,
     author = {Wylie, Brian J. N. and Mohr, Bernd and Wolf, Felix},
      month = sep,
      title = {Holistic Hardware Counter Performance Analysis of Parallel Programs},
  booktitle = {Proc. of the Conference on Parallel Computing (ParCo), Malaga, Spain},
       year = {2005},
      pages = {187--194},
       isbn = {3-00-017352-8}
}

@INPROCEEDINGS{mohr_ea:2005:communication,
     author = {Mohr, Bernd and K{\"{u}}hnal, Andrej and Hermanns, Marc-Andr{\'{e}} and Wolf, Felix},
      month = sep,
      title = {Performance Analysis of One-sided Communication Mechanisms},
  booktitle = {Proc. of the Conference on Parallel Computing (ParCo), Malaga, Spain},
       year = {2005},
       note = {{\em Minisymposium Performance Analysis}},
       isbn = {3-00-017352-8}
}

@INPROCEEDINGS{hermanns_ea:2005:measurement,
     author = {Hermanns, Marc-Andr{\'{e}} and Mohr, Bernd and Wolf, Felix},
      month = aug # "--" # sep,
      title = {Event-based Measurement and Analysis of One-sided Communication},
  booktitle = {Proc. of the 11th Euro-Par Conference, Lisboa, Portugal},
     series = {Lecture Notes in Computer Science},
     volume = {3648},
       year = {2005},
      pages = {156--165},
  publisher = {Springer},
       isbn = {978-3-540-28700-1},
        doi = {10.1007/11549468_20}
}

@INPROCEEDINGS{mohr_ea:2005:caf,
     author = {Mohr, Bernd and DeRose, Luiz A. and Vetter, Jeffrey S.},
      month = aug # "--" # sep,
      title = {A Performance Measurement Infrastructure for {Co-Array} {Fortran}},
  booktitle = {Proc. of the 11th Euro-Par Conference, Lisboa, Portugal},
     series = {Lecture Notes in Computer Science},
     volume = {3648},
       year = {2005},
      pages = {146--155},
  publisher = {Springer},
        doi = {10.1007/11549468_19}
}

@INPROCEEDINGS{bhatia_ea:2005:communictaion,
     author = {Bhatia, Nikhil and Song, Fengguang and Wolf, Felix and Mohr, Bernd and Dongarra, Jack and Moore, Shirley},
      month = jun,
      title = {Automatic Experimental Analysis of Communication Patterns in Virtual Topologies},
  booktitle = {Proc. of the International Conference on Parallel Processing (ICPP), Oslo, Norway},
       year = {2005},
      pages = {465--472},
  publisher = {IEEE Computer Society},
       issn = {0190-3918},
       isbn = {0-7695-2380-3},
        doi = {10.1109/ICPP.2005.21}
}

@INPROCEEDINGS{worley_ea:2005:performanceanalysisgyro,
     author = {Worley, P. and Candy, J. and Carrington, L. and Huck, K. and Kaiser, T. and Mahinthakumar, G. and Malony, Allen D. and Moore, Shirley and Reed, D. and Roth, P. and Shan, H. and Shende, Sameer S. and Snavely, A. and Sreepathi, S. and Wolf, Felix and Zhang, Y.},
      month = jun,
      title = {Performance Analysis of {GYRO}: A Tool Evaluation},
  booktitle = {Proc. of the 2005 SciDAC Conference, San Francisco, CA, USA},
       year = {2005}
}

@INPROCEEDINGS{bhatia_ea:2005:approach,
     author = {Bhatia, Nikhil and Moore, Shirley and Wolf, Felix and Dongarra, Jack and Mohr, Bernd},
      month = may,
      title = {A Pattern-Based Approach to Automated Application Performance Analysis},
  booktitle = {Workshop on Patterns in High Performance Computing (patHPC 2005), Urbana-Champaign, IL, USA},
       year = {2005}
}

@INPROCEEDINGS{moore_ea:2005:solution,
     author = {Moore, Shirley and Wolf, Felix and Dongarra, Jack and Mohr, Bernd},
      month = feb,
      title = {Improving Time to Solution with Automated Performance Analysis},
  booktitle = {2nd Workshop on Productivity and Performance in High-End Computing (P-PHEC), San Francisco, CA, USA},
       year = {2005}
}

@TECHREPORT{song_ea:2004:cube,
       author = {Song, Fengguang and Wolf, Felix},
        title = {{CUBE} User Manual},
       number = {ICL-UT-04-01},
         year = {2004},
  institution = {University of Tennessee, Innovative Computing Laboratory}
}

@TECHREPORT{wolf:2004:earl,
       author = {Wolf, Felix},
        month = oct,
        title = {{EARL} -- {API} Documentation},
       number = {ICL-UT-04-03},
         year = {2004},
  institution = {University of Tennessee, Innovative Computing Laboratory}
}

@INPROCEEDINGS{wolf_ea:2004:patternsearch,
     author = {Wolf, Felix and Mohr, Bernd and Dongarra, Jack and Moore, Shirley},
      month = aug # "--" # sep,
      title = {Efficient Pattern Search in Large Traces through Successive Refinement},
  booktitle = {Proc. of the 10th Euro-Par Conference, Pisa, Italy},
     series = {Lecture Notes in Computer Science},
     volume = {3149},
       year = {2004},
      pages = {47--54},
  publisher = {Springer},
       issn = {0302-9743},
       isbn = {978-3-540-22924-7},
        doi = {10.1007/b99409}
}

@INPROCEEDINGS{song:2004:algebra,
     author = {Song, Fengguang and Wolf, Felix and Bhatia, Nikhil and Dongarra, Jack and Moore, Shirley},
      month = aug,
      title = {An Algebra for Cross-Experiment Performance Analysis},
  booktitle = {Proc. of the International Conference on Parallel Processing (ICPP), Montreal, Canada},
       year = {2004},
      pages = {63--72},
  publisher = {IEEE Computer Society},
       issn = {0190-3918},
       isbn = {0-7695-2197-5},
        doi = {10.1109/ICPP.2004.1327905}
}

@INPROCEEDINGS{mucci:2004:large-scale,
     author = {Mucci, Philip and Dongarra, Jack and Kufrin, Rick and Moore, Shirley and Song, Fengguang and Wolf, Felix},
      month = may,
      title = {Automating the Large-Scale Collection and Analysis of Performance Data on Linux Clusters},
  booktitle = {5th LCI International Conference on Linux Clusters: The HPC Revolution, Austin, TX, USA},
       year = {2004},
        url = {http://www.linuxclustersinstitute.org/conferences/archive/2004/technicalpapers.html}
}

@ARTICLE{wolf_ea:2003:system,
    author = {Wolf, Felix and Mohr, Bernd},
     month = nov,
     title = {Automatic performance analysis of hybrid {MPI}/{OpenMP} applications},
   journal = {Journal of Systems Architecture},
    volume = {49},
    number = {10--11},
      year = {2003},
     pages = {421--439},
       doi = {10.1016/S1383-7621(03)00102-4}
}

@INPROCEEDINGS{wolf_ea:2003:hardware-counter,
     author = {Wolf, Felix and Mohr, Bernd},
      month = sep,
      title = {Hardware-Counter Based Automatic Performance Analysis of Parallel Programs},
  booktitle = {Proc. of the Conference on Parallel Computing (ParCo), Dresden, Germany},
     series = {Advances in Parallel Computing},
     volume = {13},
       year = {2003},
      pages = {753--760},
  publisher = {Elsevier},
       note = {Minisymposium {\em Performance Analysis}},
        doi = {10.1016/S0927-5452(04)80092-3}
}

@INPROCEEDINGS{wolf_ea:2003:kojak,
     author = {Wolf, Felix and Mohr, Bernd},
      month = aug,
      title = {{KOJAK} -- {A} Tool Set for Automatic Performance Analysis of Parallel Applications},
  booktitle = {Proc. of the 9th Euro-Par Conference, Klagenfurt, Austria},
     series = {Lecture Notes in Computer Science},
     volume = {2790},
       year = {2003},
      pages = {1301--1304},
  publisher = {Springer},
       note = {Demonstrations of Parallel and Distributed Computing},
       isbn = {978-3-540-40788-1},
        doi = {10.1007/978-3-540-45209-6_177}
}

@PHDTHESIS{wolf:2003:PerformanceAnalysis,
    author = {Wolf, Felix},
     month = feb,
     title = {Automatic Performance Analysis on Parallel Computers with {SMP} Nodes},
      year = {2003},
    school = {RWTH Aachen},
   address = {Forschungszentrum J{\"{u}}lich},
      note = {NIC Series Volume 17, {ISBN} 3-00-010003-2},
       url = {http://hdl.handle.net/2128/2928}
}

@INPROCEEDINGS{wolf_ea:2003:automaticanalysis,
     author = {Wolf, Felix and Mohr, Bernd},
      month = feb,
      title = {Automatic Performance Analysis of Hybrid {MPI}/{OpenMP} Applications},
  booktitle = {Proc. of the 11th Euromicro Workshop on Parallel Distributed and Network-Based Processing (PDP), Genoa, Italy},
       year = {2003},
      pages = {13--22},
  publisher = {IEEE},
       issn = {1066-6192},
       isbn = {0-7695-1875-3},
        doi = {10.1109/EMPDP.2003.1183560}
}

@INPROCEEDINGS{mohr_ea:2001:monitoring,
     author = {Mohr, Bernd and Malony, Allen D. and Hoppe, H. C. and Schlimbach, F. and Haab, G. and Hoeflinger, J. and Shah, S.},
      month = sep,
      title = {A Performance Monitoring Interface for {OpenMP}},
  booktitle = {Proc. of the 4th European Workshop on OpenMP (EWOMP), Rome, Italy},
       year = {2002}
}

@INPROCEEDINGS{derose_ea:2002:performancemetrics,
     author = {DeRose, Luiz A. and Wolf, Felix},
      month = aug,
      title = {{CATCH} -- {A} Call-Graph Based Automatic Tool for Capture of Hardware Performance Metrics for {MPI} and {OpenMP} Applications},
  booktitle = {Proc. of the 8th Euro-Par Conference, Paderborn, Germany},
     series = {Lecture Notes in Computer Science},
     volume = {2400},
       year = {2002},
      pages = {167--176},
  publisher = {Springer},
       issn = {0302-9743},
       isbn = {978-3-540-44049-9},
        doi = {10.1007/3-540-45706-2}
}

@ARTICLE{mohr_ea:2002:designtoolinterface,
     author = {Mohr, Bernd and Malony, Allen D. and Shende, Sameer S. and Wolf, Felix},
      month = aug,
      title = {Design and Prototype of a Performance Tool Interface for {OpenMP}},
    journal = {The Journal of Supercomputing},
     volume = {23},
     number = {1},
       year = {2002},
      pages = {105--128},
  publisher = {Kluwer Academic Publishers},
       issn = {0920-8542},
        doi = {10.1023/A:1015741304337}
}

@INPROCEEDINGS{mohr_ea:2001:designperformancetoolinterface,
     author = {Mohr, Bernd and Malony, Allen D. and Shende, Sameer S. and Wolf, Felix},
      month = oct,
      title = {Design and Prototype of a Performance Tool Interface for {OpenMP}},
  booktitle = {2nd Annual Los Alamos Computer Science Institute Symposium (LACSI), Santa Fe, NM, USA},
       year = {2001}
}

@ARTICLE{wolf_ea:2001:specifyingperformanceproperties,
     author = {Wolf, Felix and Mohr, Bernd},
      month = sep,
      title = {Specifying Performance Properties of Parallel Applications Using Compound Events},
    journal = {Parallel and Distributed Computing Practices},
     volume = {4},
     number = {3},
       year = {2001},
      pages = {301--317},
  publisher = {Nova Science Publishers, Inc.},
       issn = {1097-2803},
        url = {https://www.scpe.org/index.php/scpe/article/view/249}
}

@INPROCEEDINGS{mohr_ea:2001:directiverewriting,
     author = {Mohr, Bernd and Malony, Allen D. and Shende, Sameer S. and Wolf, Felix},
      month = sep,
      title = {Towards a Performance Tool Interface for {OpenMP}: An Approach based on Directive Rewriting},
  booktitle = {3rd European Workshop on OpenMP (EWOMP), Barcelona, Spain},
       year = {2001}
}

@TECHREPORT{Fahringer_ea:2001:ESPRIT,
       author = {Fahringer, Thomas and Gerndt, Michael and Mohr, Bernd and Riley, G. and Tr{\"{a}}ff, J. L. and Wolf, Felix},
        month = aug,
        title = {{Knowledge} {Specification} for {Automatic} {Performance} {Analysis}},
       number = {FZJ-ZAM-IB-2001-08},
         year = {2001},
  institution = {ESPRIT IV Working Group APART},
      address = {Forschungszentrum J{\"{u}}lich},
         note = {Revised version}
}

@INPROCEEDINGS{Lindlan_ea:2000:framework,
     author = {Lindlan, K. A. and Cuny, J. and Malony, Allen D. and Mohr, Bernd and Rivenburgh, R. and Rasmussen, C.},
      month = nov,
      title = {{A} {Tool} {Framework} for {Static} and {Dynamic} {Analysis} of {Object-Oriented} {Software} with {Templates}},
  booktitle = {Proc. of the Supercomputing Conference (SC2000), Dallas, TX, USA},
       year = {2000}
}

@INPROCEEDINGS{wolf_ea:2000:automaticperformanceanalysis,
     author = {Wolf, Felix and Mohr, Bernd},
      month = aug # "--" # sep,
      title = {Automatic Performance Analysis of {MPI} Applications Based on Event Traces},
  booktitle = {Proc. of the 6th Euro-Par Conference, Munich, Germany},
     series = {Lecture Notes in Computer Science},
     volume = {1900},
       year = {2000},
      pages = {123--132},
  publisher = {Springer},
       issn = {0302-9743},
       isbn = {978-3-540-67956-1},
        doi = {10.1007/3-540-44520-X_16}
}

@INPROCEEDINGS{gerndt_ea:2000:spec,
        author = {Gerndt, Michael and E{\ss}er, Hans-Georg},
         month = jan,
         title = {Specification Techniques for Automatic Performance Analysis Tools},
     booktitle = {Proc. of the 8th International Workshop on Compilers for Parallel Computers (CPC), Aussois, France},
          year = {2000},
  organization = {Ecole Normale Sup{\'{e}}rieure Lyon}
}

@INPROCEEDINGS{wolf_ea:1999:earl,
     author = {Wolf, Felix and Mohr, Bernd},
      month = apr,
      title = {{EARL} -- {A} Programmable and Extensible Toolkit for Analyzing Event Traces of Message Passing Programs},
  booktitle = {Proc. of the 7th International Conference on High Performance Computing and Networking Europe (HPCN), Amsterdam, The Netherlands},
     series = {Lecture Notes in Computer Science},
     volume = {1593},
       year = {1999},
      pages = {503--512},
  publisher = {Springer},
       isbn = {978-3-540-65821-4},
        doi = {10.1007/bfb0100611}
}

@INPROCEEDINGS{gerndt_ea:1999:performanceanalysis,
     author = {Gerndt, Michael and Mohr, Bernd and Wolf, Felix and Pantano, Mario},
      month = feb,
      title = {Performance Analysis on {Cray T3E}},
  booktitle = {Proc. of the 7th Euromicro Workshop on Parallel and Distributed Processing (PDP), Funchal, Madeira, Portugal},
       year = {1999},
      pages = {241--248},
  publisher = {IEEE},
       isbn = {0-7695-0059-5},
        url = {https://ieeexplore.ieee.org/document/746679}
}

@INPROCEEDINGS{gerndt_ea:1998:automaticperformanceanalysis,
     author = {Gerndt, Michael and Mohr, Bernd and Pantano, Mario and Wolf, Felix},
      month = jun # "--" # jul,
      title = {Automatic Performance Analysis for {Cray T3E}},
  booktitle = {Proc. of the 7th Workshop on Compilers for Parallel Computers (CPC), University of Link{\"{o}}ping, Sweden},
       year = {1998},
      pages = {69--78}
}