# Source code for horovod.common.basics

# Copyright (C) 2019 Uber Technologies, Inc.
# Modifications copyright Microsoft
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================

import atexit
import ctypes

from horovod.common import util as util


class HorovodBasics(object):
    """Wrapper class for the basic Horovod API.

    The constructor loads the Horovod extension library with ``ctypes`` and
    stores the handle in ``MPI_LIB_CTYPES``; every other method forwards to a
    ``horovod_*`` entry point of that library and converts the result.
    """

    # Shared message for every call that reports "not initialized".
    _NOT_INITIALIZED_MSG = 'Horovod has not been initialized; use hvd.init().'

    def __init__(self, pkg_path, *args):
        """Load the extension library and read the reduce-op constants.

        Args:
          pkg_path: Forwarded to `util.get_extension_full_path` to locate the
            extension library.
          *args: Additional arguments forwarded to the same helper.
        """
        full_path = util.get_extension_full_path(pkg_path, *args)
        self.MPI_LIB_CTYPES = ctypes.CDLL(full_path, mode=ctypes.RTLD_GLOBAL)

        # Reduce-op identifiers as reported by the library itself.
        self.Average = self.MPI_LIB_CTYPES.horovod_reduce_op_average()
        self.Sum = self.MPI_LIB_CTYPES.horovod_reduce_op_sum()
        self.Adasum = self.MPI_LIB_CTYPES.horovod_reduce_op_adasum()

    def _checked(self, value):
        """Return `value`, raising `ValueError` if it is the -1 sentinel."""
        if value == -1:
            raise ValueError(self._NOT_INITIALIZED_MSG)
        return value

    def init(self, comm=None):
        """A function that initializes Horovod.

        Args:
          comm: List specifying ranks for the communicator, relative to the MPI_COMM_WORLD
            communicator OR the MPI communicator to use. Given communicator will be duplicated.
            If None, Horovod will use MPI_COMM_WORLD Communicator.

        Raises:
          ValueError: If `comm` is not a list and the library reports that it
            was built without MPI support.
        """
        if comm is None:
            comm = []

        # Register the shutdown hook only once per instance so that repeated
        # init() calls do not stack duplicate handlers. getattr() keeps this
        # safe for instances whose __init__ was bypassed or overridden.
        if not getattr(self, '_atexit_registered', False):
            atexit.register(self.shutdown)
            self._atexit_registered = True

        if not isinstance(comm, list):
            mpi_built = self.MPI_LIB_CTYPES.horovod_mpi_built()
            if not bool(mpi_built):
                raise ValueError(
                    "Horovod has not been built with MPI support. Ensure MPI is installed and "
                    "reinstall Horovod with HOROVOD_WITH_MPI=1 to debug the build error.")

            from mpi4py import MPI
            # Pick the ctypes type matching the size of mpi4py's communicator
            # handle: a C int when the sizes agree, a pointer-sized value
            # otherwise (in which case the parameter type is declared
            # explicitly).
            if MPI._sizeof(MPI.Comm) == ctypes.sizeof(ctypes.c_int):
                MPI_Comm = ctypes.c_int
            else:
                MPI_Comm = ctypes.c_void_p
                self.MPI_LIB_CTYPES.horovod_init_comm.argtypes = [MPI_Comm]

            # View the communicator handle in place rather than copying it.
            comm_obj = MPI_Comm.from_address(MPI._addressof(comm))
            self.MPI_LIB_CTYPES.horovod_init_comm(comm_obj)
        else:
            comm_size = len(comm)
            self.MPI_LIB_CTYPES.horovod_init(
                (ctypes.c_int * comm_size)(*comm), ctypes.c_int(comm_size))

    def shutdown(self):
        """A function that shuts Horovod down."""
        self.MPI_LIB_CTYPES.horovod_shutdown()

    def is_initialized(self):
        """Returns True if Horovod is initialized.

        Returns:
          A boolean value indicating whether Horovod is initialized.
        """
        # Convert the raw C result so callers get a real bool, like the other
        # predicates of this class.
        return bool(self.MPI_LIB_CTYPES.horovod_is_initialized())

    def start_timeline(self, file_path, mark_cycles=False):
        """Creates a timeline file at `file_path` and begins recording.

        Args:
            file_path: String path to the timeline file.
            mark_cycles: Boolean indicating that cycles should be marked on
                         the timeline (default: False).

        Raises a `ValueError` if Horovod is not initialized.
        """
        result = self.MPI_LIB_CTYPES.horovod_start_timeline(
            ctypes.c_char_p(file_path.encode('utf-8')),
            ctypes.c_bool(mark_cycles))
        if not result:
            raise ValueError(self._NOT_INITIALIZED_MSG)

    def stop_timeline(self):
        """Stops the active timeline recording and closes the file.

        Raises a `ValueError` if Horovod is not initialized.
        """
        result = self.MPI_LIB_CTYPES.horovod_stop_timeline()
        if not result:
            raise ValueError(self._NOT_INITIALIZED_MSG)

    def size(self):
        """A function that returns the number of Horovod processes.

        Returns:
          An integer scalar containing the number of Horovod processes.

        Raises a `ValueError` if Horovod is not initialized.
        """
        return self._checked(self.MPI_LIB_CTYPES.horovod_size())

    def local_size(self):
        """A function that returns the number of Horovod processes within the
        node the current process is running on.

        Returns:
          An integer scalar containing the number of local Horovod processes.

        Raises a `ValueError` if Horovod is not initialized.
        """
        return self._checked(self.MPI_LIB_CTYPES.horovod_local_size())

    def rank(self):
        """A function that returns the Horovod rank of the calling process.

        Returns:
          An integer scalar with the Horovod rank of the calling process.

        Raises a `ValueError` if Horovod is not initialized.
        """
        return self._checked(self.MPI_LIB_CTYPES.horovod_rank())

    def local_rank(self):
        """A function that returns the local Horovod rank of the calling process, within the
        node that it is running on. For example, if there are seven processes running
        on a node, their local ranks will be zero through six, inclusive.

        Returns:
          An integer scalar with the local Horovod rank of the calling process.

        Raises a `ValueError` if Horovod is not initialized.
        """
        return self._checked(self.MPI_LIB_CTYPES.horovod_local_rank())

    def is_homogeneous(self):
        """Returns True if the cluster is homogeneous.

        Returns:
          A boolean value indicating whether every node in the cluster has same number of ranks.
        """
        is_homogeneous = self.MPI_LIB_CTYPES.horovod_is_homogeneous()
        return bool(is_homogeneous)

    def mpi_threads_supported(self):
        """A function that returns a flag indicating whether MPI multi-threading is supported.

        If MPI multi-threading is supported, users may mix and match Horovod usage with other
        MPI libraries, such as `mpi4py`.

        Returns:
          A boolean value indicating whether MPI multi-threading is supported.

        Raises a `ValueError` if MPI is not enabled or Horovod is not initialized.
        """
        mpi_enabled = self.MPI_LIB_CTYPES.horovod_mpi_enabled()
        if not bool(mpi_enabled):
            raise ValueError(
                'Horovod MPI is not enabled; Please make sure it\'s installed and enabled.')

        return bool(self._checked(
            self.MPI_LIB_CTYPES.horovod_mpi_threads_supported()))

    def mpi_enabled(self):
        """Returns True if MPI mode is currently enabled at runtime.

        If MPI is enabled, users can use it for controller or data transfer operations.

        Returns:
          A boolean value indicating whether MPI is enabled.
        """
        mpi_enabled = self.MPI_LIB_CTYPES.horovod_mpi_enabled()
        return bool(mpi_enabled)

    def mpi_built(self):
        """Returns True if Horovod was compiled with MPI support.

        Returns:
          A boolean value indicating whether MPI support was compiled.
        """
        return bool(self.MPI_LIB_CTYPES.horovod_mpi_built())

    def gloo_enabled(self):
        """Returns True if Gloo mode is currently enabled at runtime.

        If Gloo is enabled, users can use it for controller or data transfer operations.

        Returns:
          A boolean value indicating whether Gloo is enabled.
        """
        gloo_enabled = self.MPI_LIB_CTYPES.horovod_gloo_enabled()
        return bool(gloo_enabled)

    def gloo_built(self):
        """Returns True if Horovod was compiled with Gloo support.

        Returns:
          A boolean value indicating whether Gloo support was compiled.
        """
        return bool(self.MPI_LIB_CTYPES.horovod_gloo_built())

    def nccl_built(self):
        """Function to check if Horovod was compiled with NCCL support.

        Returns:
          An integer value indicating whether NCCL support was compiled.
          If NCCL support was compiled, returns NCCL_VERSION_CODE. Otherwise,
          returns 0.
        """
        return int(self.MPI_LIB_CTYPES.horovod_nccl_built())

    def ddl_built(self):
        """Returns True if Horovod was compiled with DDL support.

        Returns:
          A boolean value indicating whether DDL support was compiled.
        """
        return bool(self.MPI_LIB_CTYPES.horovod_ddl_built())

    def ccl_built(self):
        """Returns True if Horovod was compiled with oneCCL support.

        Returns:
          A boolean value indicating whether oneCCL support was compiled.
        """
        return bool(self.MPI_LIB_CTYPES.horovod_ccl_built())

    def cuda_built(self):
        """Returns True if Horovod was compiled with CUDA support.

        Returns:
          A boolean value indicating whether CUDA support was compiled.
        """
        return bool(self.MPI_LIB_CTYPES.horovod_cuda_built())

    def rocm_built(self):
        """Returns True if Horovod was compiled with ROCm support.

        Returns:
          A boolean value indicating whether ROCm support was compiled.
        """
        return bool(self.MPI_LIB_CTYPES.horovod_rocm_built())