Source code for ensemble_md.utils.utils

####################################################################
#                                                                  #
#    ensemble_md,                                                  #
#    a python package for running GROMACS simulation ensembles     #
#                                                                  #
#    Written by Wei-Tse Hsu <wehs7661@colorado.edu>                #
#    Copyright (c) 2022 University of Colorado Boulder             #
#                                                                  #
####################################################################
"""
The :obj:`.utils` module provides useful utility functions for running or analyzing REXEE simulations.
"""
import sys
import glob
import natsort
import datetime
import subprocess
import collections
import numpy as np


[docs]class Logger:
    """
    A logger class that redirects the STDOUT and STDERR to a specified output file while
    preserving the output on screen. This is useful for logging terminal output to a file
    for later analysis while still seeing the output in real-time during execution.

    Parameters
    ----------
    logfile : str
        The file path of which the standard output and standard error should be logged.

    Attributes
    ----------
    terminal : :code:`io.TextIOWrapper` object
        The original standard output object, typically :code:`sys.stdout`.
    log : :code:`io.TextIOWrapper` object
        File object used to log the output in append mode.
    """

    def __init__(self, logfile):
        self.terminal = sys.stdout
        self.log = open(logfile, "a")

[docs]    def write(self, message):
        """
        Writes a message to the terminal and to the log file.

        Parameters
        ----------
        message : str
            The message to be written to STDOUT and the log file.
        """
        self.terminal.write(message)
        self.log.write(message)

[docs]    def flush(self):
        """
        This method is needed for Python 3 compatibility. This handles the flush command by doing nothing.
        Some extra behaviors may be specified here.
        """
        # self.terminal.log()
        pass


[docs]def run_gmx_cmd(arguments, prompt_input=None):
    """
    Runs a GROMACS command through a subprocess call.

    Parameters
    ----------
    arguments : list
        A list of arguments that compose of the GROMACS command to run, e.g.,
        :code:`['gmx', 'mdrun', '-deffnm', 'sys']`.
    prompt_input : str or None, Optional
        The input to be passed to the interative prompt launched by the GROMACS command, if any.

    Returns
    -------
    return_code : int
        The exit code of the GROMACS command. Any number other than 0 indicates an error.
    stdout : str or None
        The STDOUT of the process.
    stderr: str or None
        The STDERR or the process.
    """
    try:
        result = subprocess.run(arguments, capture_output=True, text=True, input=prompt_input, check=True)
        return_code, stdout, stderr = result.returncode, result.stdout, None
    except subprocess.CalledProcessError as e:
        return_code, stdout, stderr = e.returncode, None, e.stderr

    return return_code, stdout, stderr


[docs]def format_time(t):
    """
    Converts time in seconds to a more readable format.

    Parameters
    ----------
    t : float
        The time in seconds.

    Returns
    -------
    t_str : str
        A string representing the time duration in a format of "X hour(s) Y minute(s) Z second(s)", adjusting the units
        as necessary based on the input duration, e.g., 1 hour(s) 0 minute(s) 0 second(s) for 3600 seconds and
        15 minute(s) 30 second(s) for 930 seconds.
    """
    hh_mm_ss = str(datetime.timedelta(seconds=t)).split(":")

    if "day" in hh_mm_ss[0]:
        # hh_mm_ss[0] will contain "day" and cannot be converted to float
        hh, mm, ss = hh_mm_ss[0], float(hh_mm_ss[1]), float(hh_mm_ss[2])
        t_str = f"{hh} hour(s) {mm:.0f} minute(s) {ss:.0f} second(s)"
    else:
        hh, mm, ss = float(hh_mm_ss[0]), float(hh_mm_ss[1]), float(hh_mm_ss[2])
        if hh == 0:
            if mm == 0:
                t_str = f"{ss:.1f} second(s)"
            else:
                t_str = f"{mm:.0f} minute(s) {ss:.0f} second(s)"
        else:
            t_str = f"{hh:.0f} hour(s) {mm:.0f} minute(s) {ss:.0f} second(s)"

    return t_str


def _convert_to_numeric(s):
    """
    Converts the input to a numerical type when possible. This internal function is used for the MDP parser.

    Parameters
    ----------
    s : any
        The input value to be converted to a numerical type if possible. The data type of :code:`s` is
        usually :code:`str` but can be any. However, if :code:`s` is not a string, it will be returned as is.

    Returns
    -------
    numerical : any
        The converted numerical value. If :code:`s` can be converted to a single numerical value,
        that value is returned as an :code:`int` or :code:`float`. If :code:`s` can be converted to
        multiple numerical values, a list containing those values is returned.
        If :code:`s` cannot be converted to a numerical value, :code:`s` is returned as is.
    """
    if type(s) is not str:
        return s
    for converter in int, float, str:  # try them in increasing order of lenience
        try:
            s = [converter(i) for i in s.split()]
            if len(s) == 1:
                return s[0]
            else:
                return s
        except (ValueError, AttributeError):
            pass


def _get_subplot_dimension(n_panels):
    """
    Gets the number of rows and columns for a subplot based on the number of panels such
    that the subplots are arranged in a grid that is as square as possible. A greater number
    of columns is preferred to a greater number of rows.

    Parameters
    ----------
    n_panels : int
        The number of panels to be arranged in subplots.

    Example
    -------
    >>> from ensemble_md.utils import utils
    >>> utils._get_subplot_dimension(10)
    (4, 3)
    """
    if int(np.sqrt(n_panels) + 0.5) ** 2 == n_panels:
        # perfect square number
        n_cols = int(np.sqrt(n_panels))
    else:
        n_cols = int(np.floor(np.sqrt(n_panels))) + 1

    if n_panels % n_cols == 0:
        n_rows = int(np.floor(n_panels / n_cols))
    else:
        n_rows = int(np.floor(n_panels / n_cols)) + 1

    return n_rows, n_cols


[docs]def weighted_mean(vals, errs):
    """
    Calculates the inverse-variance-weighted mean. Note that if
    any error is 0, the simple mean will be returned.

    Parameters
    ----------
    vals : list
        A list of values to be averaged.
    errs : list
        A list of errors corresponding to the given values

    Returns
    -------
    mean : float
        The inverse-variance-weighted mean.
    err : float
        The propgated error of the mean.
    """
    if 0 in errs:
        # This could happen in the very beginning of a simulation, which could lead to an ZeroDivision warning/error.
        # In this case, we just ignore the w vector and just calculate the simple average.
        mean, err = np.mean(vals), None
        return mean, err

    w = [1 / (i ** 2) for i in errs]
    wx = [w[i] * vals[i] for i in range(len(vals))]
    mean = np.sum(wx) / np.sum(w)
    err = np.sqrt(1 / np.sum(w))

    return mean, err


[docs]def calc_rmse(data, ref):
    """
    Calculates the root mean square error (RMSE) of the given data
    with respect to the reference data.

    Parameters
    ----------
    data : list
        A list of values to be compared with the reference data.
    ref : list
        A list of reference values.

    Returns
    -------
    rmse : float
        The root mean square error.
    """
    rmse = np.sqrt(np.mean((np.array(data) - np.array(ref)) ** 2))

    return rmse


[docs]def get_time_metrics(log):
    """
    Gets the time-based metrics from a log file of a REXEE simulation, including the core time,
    wall time, and performance (ns/day).

    Parameters
    ----------
    log : str
        The file path of the input log file.

    Returns
    -------
    t_metrics : dict
        A dictionary having following keys: :code:`t_core`, :code:`t_wall`, :code:`performance`.
    """
    t_metrics = {}
    with open(log, 'r') as f:
        # Using deque is much faster than using readlines and reversing it since we only need the last few lines.
        lines = collections.deque(f, 20)  # get the last 20 lines should be enough

    for l in lines:  # noqa: E741
        if 'Performance: ' in l:
            t_metrics['performance'] = float(l.split()[1])  # ns/day
        if 'Time: ' in l:
            t_metrics['t_core'] = float(l.split()[1])  # s
            t_metrics['t_wall'] = float(l.split()[2])  # s

    return t_metrics


[docs]def analyze_REXEE_time(n_iter=None, log_files=None):
    """
    Performs simple data analysis on the wall times and performances of all iterations of an REXEE simulation.

    Parameters
    ----------
    n_iter : None or int, Optional
        The number of iterations in the REXEE simulation. If None, the function will try to find the number of
        iterations by counting the number of directories named in the format of :code`iteration_*` in the simulation
        directory (specifically :code:`sim_0`) in the current working directory or where the log files are located.
    log_files : None or list, Optional
        A list of lists of log paths with the shape of :code:`(n_iter, n_replicas)`. If None, the function will try to
        find the log files by searching the current working directory.

    Returns
    -------
    t_wall_tot : float
        The total wall time GROMACS spent to finish all iterations for the REXEE simulation.
    t_sync : float
        The total time spent in synchronizing all replicas, which is the sum of the differences
        between the longest and the shortest time elapsed to finish a iteration.
    t_wall_list : list
        The list of wall times for finishing each GROMACS mdrun command.
    """
    if n_iter is None:
        if log_files is None:
            n_iter = len(glob.glob('sim_0/iteration_*'))
        else:
            n_iter = len(log_files)

    if log_files is None:
        log_files = [natsort.natsorted(glob.glob(f'sim_*/iteration_{i}/*log')) for i in range(n_iter)]

    if len(log_files) == 0:
        raise FileNotFoundError("No sim/iteration directories found.")

    t_wall_list = []
    t_wall_tot, t_sync = 0, 0
    for i in range(n_iter):
        t_wall = [get_time_metrics(log_files[i][j])['t_wall'] for j in range(len(log_files[i]))]
        t_wall_list.append(t_wall)
        t_wall_tot += max(t_wall)
        t_sync += (max(t_wall) - min(t_wall))

    return t_wall_tot, t_sync, t_wall_list