# fm-orchestrator/module_build_service/utils.py
# Copyright (c) 2016 Red Hat, Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
# Written by Ralph Bean <rbean@redhat.com>
# Matt Prahl <mprahl@redhat.com>
""" Utility functions for module_build_service. """
import re
import copy
import functools
import time
import shutil
import tempfile
import os
import kobo.rpmlib
import inspect
import hashlib
import modulemd
import yaml
from flask import request, url_for
from datetime import datetime
from module_build_service import log, models
from module_build_service.errors import (ValidationError, UnprocessableEntity,
ProgrammingError)
from module_build_service import conf, db
from module_build_service.errors import (Forbidden, Conflict)
import module_build_service.messaging
from multiprocessing.dummy import Pool as ThreadPool
import module_build_service.pdc
from module_build_service.pdc import resolve_requires
import concurrent.futures
def retry(timeout=conf.net_timeout, interval=conf.net_retry_interval, wait_on=Exception):
    """ A decorator that retries a section of code until success or timeout. """
    def wrapper(function):
        @functools.wraps(function)
        def inner(*args, **kwargs):
            start = time.time()
            while True:
                try:
                    return function(*args, **kwargs)
                except wait_on as e:
                    if (time.time() - start) >= timeout:
                        # Out of time, give up and re-raise the last exception.
                        raise
                    log.warning("Exception %r raised from %r. Retry in %rs" % (
                        e, function, interval))
                    time.sleep(interval)
        return inner
    return wrapper
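
# Example usage of the retry decorator above (an illustrative sketch only; the
# decorated function and its argument are hypothetical and not part of this
# module):
#
#   @retry(timeout=120, interval=10, wait_on=IOError)
#   def read_koji_config(path):
#       with open(path) as f:
#           return f.read()
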
def at_concurrent_component_threshold(config, session):
"""
Determines if the number of concurrent component builds has reached
the configured threshold
:param config: Module Build Service configuration object
:param session: SQLAlchemy database session
:return: boolean representing if there are too many concurrent builds at
this time
"""
# We must not check it for "mock" backend.
# It would lead to multiple calls of continue_batch_build method and
# creation of multiple worker threads there. Mock backend uses thread-id
# to create and identify mock buildroot and for mock backend, we must
# build whole module in this single continue_batch_build call to keep
# the number of created buildroots low. The concurrent build limit
# for mock backend is secured by setting max_workers in
# ThreadPoolExecutor to num_concurrent_builds.
if conf.system == "mock":
return False
import koji # Placed here to avoid py2/py3 conflicts...
if config.num_concurrent_builds and config.num_concurrent_builds <= \
session.query(models.ComponentBuild).filter_by(
state=koji.BUILD_STATES['BUILDING'],
# Components which are reused should not be counted in, because
# we do not submit new build for them. They are in BUILDING state
# just internally in MBS to be handled by
# scheduler.handlers.components.complete.
reused_component_id=None).count():
return True
return False
def start_build_component(builder, c):
"""
Submits single component build to builder. Called in thread
by QueueBasedThreadPool in continue_batch_build.
"""
import koji
try:
c.task_id, c.state, c.state_reason, c.nvr = builder.build(
artifact_name=c.package, source=c.scmurl)
except Exception as e:
c.state = koji.BUILD_STATES['FAILED']
c.state_reason = "Failed to build artifact %s: %s" % (c.package, str(e))
log.exception(e)
return
if not c.task_id and c.state == koji.BUILD_STATES['BUILDING']:
c.state = koji.BUILD_STATES['FAILED']
c.state_reason = ("Failed to build artifact %s: "
"Builder did not return task ID" % (c.package))
return
def continue_batch_build(config, module, session, builder, components=None):
"""
Continues building current batch. Submits next components in the batch
until it hits concurrent builds limit.
Returns list of BaseMessage instances which should be scheduled by the
scheduler.
"""
import koji # Placed here to avoid py2/py3 conflicts...
# The user can either pass in a list of components to 'seed' the batch, or
# if none are provided then we just select everything that hasn't
# successfully built yet or isn't currently being built.
unbuilt_components = components or [
c for c in module.component_builds
if (c.state != koji.BUILD_STATES['COMPLETE'] and
c.state != koji.BUILD_STATES['BUILDING'] and
c.state != koji.BUILD_STATES['FAILED'] and
c.batch == module.batch)
]
if not unbuilt_components:
log.debug("Cannot continue building module %s. No component to build." % module)
return []
    # Get the list of components to be built in this batch. We are not building
    # all `unbuilt_components`, because we may hit the num_concurrent_builds
    # threshold.
further_work = []
components_to_build = []
# Sort the unbuilt_components so that the components that take the longest to build are
# first
log.info('Sorting the unbuilt components by their average build time')
unbuilt_components.sort(key=lambda c: builder.get_average_build_time(c), reverse=True)
log.info('Done sorting the unbuilt components by their average build time')
for c in unbuilt_components:
# Check the concurrent build threshold.
if at_concurrent_component_threshold(config, session):
log.info('Concurrent build threshold met')
break
# We set state to "BUILDING" here because at this point we are committed
# to build the component and at_concurrent_component_threshold() works by
# counting the number of components in the "BUILDING" state.
c.state = koji.BUILD_STATES['BUILDING']
components_to_build.append(c)
# Start build of components in this batch.
max_workers = 1
if config.num_concurrent_builds > 0:
max_workers = config.num_concurrent_builds
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = {executor.submit(start_build_component, builder, c):
c for c in components_to_build}
concurrent.futures.wait(futures)
        # In case an exception has been raised directly in
        # start_build_component, future.result() will re-raise it in the
        # main thread so it is not lost.
for future in futures:
future.result()
    # If all components in this batch are already done, it can mean that they
    # have been built in the past and have been skipped in this module build.
    # We therefore have to generate a fake KojiRepoChange message, because the
    # repo has also been regenerated in the past and the build system will not
    # send us any message now.
if (all(c.state in [koji.BUILD_STATES['COMPLETE'],
koji.BUILD_STATES['FAILED']] or c.reused_component_id
for c in unbuilt_components) and builder.module_build_tag):
further_work += [module_build_service.messaging.KojiRepoChange(
'start_build_batch: fake msg', builder.module_build_tag['name'])]
session.commit()
return further_work
def start_next_batch_build(config, module, session, builder, components=None):
"""
    Tries to start the build of the next batch. If there are still unbuilt
    components in the current batch, it submits more of them until the
    concurrent builds limit is hit. Otherwise it increments module.batch and
    submits component builds from the next batch.
:return: a list of BaseMessage instances to be handled by the MBSConsumer.
"""
import koji # Placed here to avoid py2/py3 conflicts...
# Check the status of the module build and current batch so we can
# later decide if we can start new batch or not.
has_unbuilt_components = False
has_unbuilt_components_in_batch = False
has_building_components_in_batch = False
has_failed_components = False
# This is used to determine if it's worth checking if a component can be reused
# later on in the code
all_reused_in_prev_batch = True
for c in module.component_builds:
if c.state in [None, koji.BUILD_STATES['BUILDING']]:
has_unbuilt_components = True
if c.batch == module.batch:
if not c.state:
has_unbuilt_components_in_batch = True
elif c.state == koji.BUILD_STATES['BUILDING']:
has_building_components_in_batch = True
elif (c.state in [koji.BUILD_STATES['FAILED'],
koji.BUILD_STATES['CANCELED']]):
has_failed_components = True
if c.batch == module.batch and not c.reused_component_id:
all_reused_in_prev_batch = False
# Do not start new batch if there are no components to build.
if not has_unbuilt_components:
log.debug("Not starting new batch, there is no component to build "
"for module %s" % module)
return []
# Check that there is something to build in current batch before starting
# the new one. If there is, continue building current batch.
if has_unbuilt_components_in_batch:
log.info("Continuing building batch %d", module.batch)
return continue_batch_build(
config, module, session, builder, components)
# Check that there are no components in BUILDING state in current batch.
# If there are, wait until they are built.
if has_building_components_in_batch:
log.debug("Not starting new batch, there are still components in "
"BUILDING state in current batch for module %s", module)
return []
# Check that there are no failed components in this batch. If there are,
# do not start the new batch.
if has_failed_components:
log.info("Not starting new batch, there are failed components for "
"module %s", module)
return []
    # Identify active tasks which might contain relics of previous builds
    # and fail the module build if that happens.
active_tasks = builder.list_tasks_for_components(module.component_builds,
state='active')
if isinstance(active_tasks, list) and active_tasks:
state_reason = ("Cannot start a batch, because some components are already"
" in 'building' state.")
state_reason += " See tasks (ID): {}".format(
', '.join([str(t['id']) for t in active_tasks])
)
module.transition(config, state=models.BUILD_STATES['failed'],
state_reason=state_reason)
session.commit()
return []
else:
log.debug("Builder {} doesn't provide information about active tasks."
.format(builder))
# Find out if there is repo regeneration in progress for this module.
# If there is, wait until the repo is regenerated before starting a new
# batch.
artifacts = [c.nvr for c in module.current_batch()]
if not builder.buildroot_ready(artifacts):
log.info("Not starting new batch, not all of %r are in the buildroot. "
"Waiting." % artifacts)
return []
    # Although this variable isn't strictly necessary, it makes the code below easier to read
prev_batch = module.batch
module.batch += 1
# The user can either pass in a list of components to 'seed' the batch, or
# if none are provided then we just select everything that hasn't
# successfully built yet or isn't currently being built.
unbuilt_components = components or [
c for c in module.component_builds
if (c.state != koji.BUILD_STATES['COMPLETE'] and
c.state != koji.BUILD_STATES['BUILDING'] and
c.state != koji.BUILD_STATES['FAILED'] and
c.batch == module.batch)
]
# If there are no components to build, skip the batch and start building
# the new one. This can happen when resubmitting the failed module build.
if not unbuilt_components and not components:
log.info("Skipping build of batch %d, no component to build.",
module.batch)
return start_next_batch_build(config, module, session, builder)
log.info("Starting build of next batch %d, %s" % (module.batch,
unbuilt_components))
# Attempt to reuse any components possible in the batch before attempting to build any
further_work = []
unbuilt_components_after_reuse = []
components_reused = False
should_try_reuse = True
# If the rebuild strategy is "changed-and-after", try to figure out if it's worth checking if
# the components can be reused to save on resources
if module.rebuild_strategy == 'changed-and-after':
        # Check whether all builds in the previous batch were reused, except when the
        # previous batch was 1, because that batch always builds the module-build-macros component
should_try_reuse = all_reused_in_prev_batch or prev_batch == 1
if should_try_reuse:
for c in unbuilt_components:
previous_component_build = get_reusable_component(
session, module, c.package)
if previous_component_build:
components_reused = True
further_work += reuse_component(c, previous_component_build)
else:
unbuilt_components_after_reuse.append(c)
# Commit the changes done by reuse_component
if components_reused:
session.commit()
# If all the components were reused in the batch then make a KojiRepoChange
# message and return
if components_reused and not unbuilt_components_after_reuse:
further_work.append(module_build_service.messaging.KojiRepoChange(
'start_build_batch: fake msg', builder.module_build_tag['name']))
return further_work
return further_work + continue_batch_build(
config, module, session, builder, unbuilt_components_after_reuse)
def pagination_metadata(p_query, request_args):
"""
Returns a dictionary containing metadata about the paginated query.
This must be run as part of a Flask request.
:param p_query: flask_sqlalchemy.Pagination object
:param request_args: a dictionary of the arguments that were part of the
Flask request
:return: a dictionary containing metadata about the paginated query
"""
request_args_wo_page = dict(copy.deepcopy(request_args))
# Remove pagination related args because those are handled elsewhere
# Also, remove any args that url_for accepts in case the user entered
# those in
for key in ['page', 'per_page', 'endpoint']:
if key in request_args_wo_page:
request_args_wo_page.pop(key)
for key in request_args:
if key.startswith('_'):
request_args_wo_page.pop(key)
pagination_data = {
'page': p_query.page,
'pages': p_query.pages,
'per_page': p_query.per_page,
'prev': None,
'next': None,
'total': p_query.total,
'first': url_for(request.endpoint, page=1, per_page=p_query.per_page,
_external=True, **request_args_wo_page),
'last': url_for(request.endpoint, page=p_query.pages,
per_page=p_query.per_page, _external=True,
**request_args_wo_page)
}
if p_query.has_prev:
pagination_data['prev'] = url_for(request.endpoint, page=p_query.prev_num,
per_page=p_query.per_page, _external=True,
**request_args_wo_page)
if p_query.has_next:
pagination_data['next'] = url_for(request.endpoint, page=p_query.next_num,
per_page=p_query.per_page, _external=True,
**request_args_wo_page)
return pagination_data
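
# For illustration only (all values below are made up), the dictionary returned
# by pagination_metadata() looks roughly like:
#
#   {
#       'page': 2, 'pages': 5, 'per_page': 10, 'total': 48,
#       'first': 'https://mbs.example.com/...?page=1&per_page=10',
#       'last': 'https://mbs.example.com/...?page=5&per_page=10',
#       'prev': 'https://mbs.example.com/...?page=1&per_page=10',
#       'next': 'https://mbs.example.com/...?page=3&per_page=10',
#   }
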
def _add_order_by_clause(flask_request, query, column_source):
"""
Orders the given SQLAlchemy query based on the GET arguments provided
:param flask_request: a Flask request object
:param query: a SQLAlchemy query object
:param column_source: a SQLAlchemy database model
:return: a SQLAlchemy query object
"""
colname = "id"
descending = True
order_desc_by = flask_request.args.get("order_desc_by", None)
if order_desc_by:
colname = order_desc_by
else:
order_by = flask_request.args.get("order_by", None)
if order_by:
colname = order_by
descending = False
column = getattr(column_source, colname, None)
if not column:
raise ValidationError('An invalid order_by or order_desc_by key '
'was supplied')
if descending:
column = column.desc()
return query.order_by(column)
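
# For example (the query strings are illustrative), "?order_by=name" orders the
# results ascending by the "name" column, "?order_desc_by=id" orders descending
# by "id", and when neither parameter is given the default is "id" descending.
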
def filter_component_builds(flask_request):
"""
Returns a flask_sqlalchemy.Pagination object based on the request parameters
    :param flask_request: Flask request object
:return: flask_sqlalchemy.Pagination
"""
search_query = dict()
state = flask_request.args.get('state', None)
if state:
if state.isdigit():
search_query['state'] = state
else:
if state in models.BUILD_STATES:
search_query['state'] = models.BUILD_STATES[state]
else:
raise ValidationError('An invalid state was supplied')
# Lookup module_build from task_id, ref, format, nvr or tagged attribute
# of a component build.
for key in ['task_id', 'ref', 'nvr', 'format', 'tagged']:
if flask_request.args.get(key, None):
search_query[key] = flask_request.args[key]
query = models.ComponentBuild.query
if search_query:
query = query.filter_by(**search_query)
query = _add_order_by_clause(flask_request, query, models.ComponentBuild)
page = flask_request.args.get('page', 1, type=int)
per_page = flask_request.args.get('per_page', 10, type=int)
return query.paginate(page, per_page, False)
def filter_module_builds(flask_request):
"""
Returns a flask_sqlalchemy.Pagination object based on the request parameters
    :param flask_request: Flask request object
:return: flask_sqlalchemy.Pagination
"""
search_query = dict()
state = flask_request.args.get('state', None)
if state:
if state.isdigit():
search_query['state'] = state
else:
if state in models.BUILD_STATES:
search_query['state'] = models.BUILD_STATES[state]
else:
raise ValidationError('An invalid state was supplied')
for key in ['name', 'owner', 'koji_tag']:
if flask_request.args.get(key, None):
search_query[key] = flask_request.args[key]
query = models.ModuleBuild.query
if search_query:
query = query.filter_by(**search_query)
# This is used when filtering the date request parameters, but it is here to avoid recompiling
utc_iso_datetime_regex = re.compile(
r'^(?P<datetime>\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})(?:\.\d+)?'
r'(?:Z|[-+]00(?::00)?)?$')
# Filter the query based on date request parameters
for item in ('submitted', 'modified', 'completed'):
for context in ('before', 'after'):
request_arg = '%s_%s' % (item, context) # i.e. submitted_before
            iso_datetime_arg = flask_request.args.get(request_arg, None)
if iso_datetime_arg:
iso_datetime_matches = re.match(utc_iso_datetime_regex, iso_datetime_arg)
if not iso_datetime_matches or not iso_datetime_matches.group('datetime'):
raise ValidationError(('An invalid Zulu ISO 8601 timestamp was provided'
' for the "%s" parameter')
% request_arg)
# Converts the ISO 8601 string to a datetime object for SQLAlchemy to use to filter
item_datetime = datetime.strptime(iso_datetime_matches.group('datetime'),
'%Y-%m-%dT%H:%M:%S')
# Get the database column to filter against
column = getattr(models.ModuleBuild, 'time_' + item)
if context == 'after':
query = query.filter(column >= item_datetime)
elif context == 'before':
query = query.filter(column <= item_datetime)
query = _add_order_by_clause(flask_request, query, models.ModuleBuild)
page = flask_request.args.get('page', 1, type=int)
per_page = flask_request.args.get('per_page', 10, type=int)
return query.paginate(page, per_page, False)
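
# A sketch of how the filters in filter_module_builds() combine (the endpoint
# path below is an assumption, not defined in this file):
#
#   GET /module-build-service/1/module-builds/?state=ready&name=testmodule
#       &submitted_after=2017-03-01T00:00:00Z&order_desc_by=id
#
# filters by state and name, restricts time_submitted, and orders by id descending.
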
def _fetch_mmd(url, branch=None, allow_local_url=False, whitelist_url=False):
# Import it here, because SCM uses utils methods
# and fails to import them because of dep-chain.
import module_build_service.scm
yaml = ""
td = None
scm = None
try:
log.debug('Verifying modulemd')
td = tempfile.mkdtemp()
if whitelist_url:
scm = module_build_service.scm.SCM(url, branch, [url], allow_local_url)
else:
scm = module_build_service.scm.SCM(url, branch, conf.scmurls, allow_local_url)
scm.checkout(td)
scm.verify()
cofn = scm.get_module_yaml()
with open(cofn, "r") as mmdfile:
yaml = mmdfile.read()
finally:
try:
if td is not None:
shutil.rmtree(td)
except Exception as e:
log.warning(
"Failed to remove temporary directory {!r}: {}".format(
td, str(e)))
mmd = load_mmd(yaml)
# If the name was set in the modulemd, make sure it matches what the scmurl
# says it should be
if mmd.name and mmd.name != scm.name:
raise ValidationError('The name "{0}" that is stored in the modulemd '
'is not valid'.format(mmd.name))
else:
mmd.name = scm.name
# If the stream was set in the modulemd, make sure it matches what the repo
# branch is
if mmd.stream and mmd.stream != scm.branch:
raise ValidationError('The stream "{0}" that is stored in the modulemd '
'does not match the branch "{1}"'.format(
mmd.stream, scm.branch))
else:
mmd.stream = str(scm.branch)
# If the version is in the modulemd, throw an exception since the version
# is generated by pdc-updater
if mmd.version:
raise ValidationError('The version "{0}" is already defined in the '
'modulemd but it shouldn\'t be since the version '
'is generated based on the commit time'.format(
mmd.version))
else:
mmd.version = int(scm.version)
return mmd, scm
def load_mmd(yaml):
mmd = modulemd.ModuleMetadata()
try:
mmd.loads(yaml)
except Exception as e:
log.error('Invalid modulemd: %s' % str(e))
raise UnprocessableEntity('Invalid modulemd: %s' % str(e))
return mmd
def _scm_get_latest(pkg):
try:
# If the modulemd specifies that the 'f25' branch is what
# we want to pull from, we need to resolve that f25 branch
# to the specific commit available at the time of
# submission (now).
pkgref = module_build_service.scm.SCM(
pkg.repository).get_latest(branch=pkg.ref)
except Exception as e:
log.exception(e)
return {
'error': "Failed to get the latest commit for %s#%s" % (pkg.repository, pkg.ref)
}
return {
'pkg_name': pkg.name,
'pkg_ref': pkgref,
'error': None
}
def load_local_builds(local_build_nsvs, session=None):
"""
    Loads previously finished local module builds from conf.mock_resultsdir
    and imports them into the database.
    :param local_build_nsvs: list of NSVs (NAME[:STREAM[:VERSION]] strings)
        defining the modules to load from the mock_resultsdir.
"""
if not local_build_nsvs:
return
if not session:
session = db.session
    if not isinstance(local_build_nsvs, list):
local_build_nsvs = [local_build_nsvs]
# Get the list of all available local module builds.
builds = []
try:
for d in os.listdir(conf.mock_resultsdir):
m = re.match('^module-(.*)-([^-]*)-([0-9]+)$', d)
if m:
builds.append((m.group(1), m.group(2), int(m.group(3)), d))
except OSError:
pass
# Sort with the biggest version first
    builds.sort(key=lambda a: a[2], reverse=True)
for build_id in local_build_nsvs:
parts = build_id.split(':')
if len(parts) < 1 or len(parts) > 3:
raise RuntimeError(
                'The local build "{0}" couldn\'t be parsed into '
'NAME[:STREAM[:VERSION]]'.format(build_id))
name = parts[0]
stream = parts[1] if len(parts) > 1 else None
version = int(parts[2]) if len(parts) > 2 else None
found_build = None
for build in builds:
if name != build[0]:
continue
if stream is not None and stream != build[1]:
continue
if version is not None and version != build[2]:
continue
found_build = build
break
if not found_build:
raise RuntimeError(
'The local build "{0}" couldn\'t be found in "{1}"'.format(
build_id, conf.mock_resultsdir))
# Load the modulemd metadata.
path = os.path.join(conf.mock_resultsdir, found_build[3], 'results')
mmd_path = os.path.join(path, 'modules.yaml')
with open(mmd_path, 'r') as f:
mmd_data = yaml.safe_load(f)
mmd = modulemd.ModuleMetadata()
mmd.loadd(mmd_data)
# Create ModuleBuild in database.
module = models.ModuleBuild.create(
session,
conf,
name=mmd.name,
stream=mmd.stream,
version=str(mmd.version),
modulemd=mmd.dumps(),
scmurl="",
username="mbs")
module.koji_tag = path
session.commit()
if (found_build[0] != module.name or found_build[1] != module.stream or
str(found_build[2]) != module.version):
raise RuntimeError(
'Parsed metadata results for "{0}" don\'t match the directory name'
.format(found_build[3]))
log.info("Loaded local module build %r", module)
def format_mmd(mmd, scmurl, session=None):
"""
Prepares the modulemd for the MBS. This does things such as replacing the
branches of components with commit hashes and adding metadata in the xmd
dictionary.
:param mmd: the ModuleMetadata object to format
:param scmurl: the url to the modulemd
"""
# Import it here, because SCM uses utils methods and fails to import
# them because of dep-chain.
from module_build_service.scm import SCM
if not session:
session = db.session
mmd.xmd['mbs'] = {'scmurl': scmurl, 'commit': None}
local_modules = models.ModuleBuild.local_modules(session)
local_modules = {m.name + "-" + m.stream: m for m in local_modules}
# If module build was submitted via yaml file, there is no scmurl
if scmurl:
scm = SCM(scmurl)
# If a commit hash is provided, add that information to the modulemd
if scm.commit:
# We want to make sure we have the full commit hash for consistency
if SCM.is_full_commit_hash(scm.scheme, scm.commit):
full_scm_hash = scm.commit
else:
full_scm_hash = scm.get_full_commit_hash()
mmd.xmd['mbs']['commit'] = full_scm_hash
# If a commit hash wasn't provided then just get the latest from master
else:
mmd.xmd['mbs']['commit'] = scm.get_latest()
pdc = module_build_service.pdc.get_pdc_client_session(conf)
# Resolve Build-requires.
if mmd.buildrequires:
mmd.xmd['mbs']['buildrequires'] = resolve_requires(
pdc, mmd.buildrequires)
else:
mmd.xmd['mbs']['buildrequires'] = {}
# Resolve Requires.
if mmd.requires:
mmd.xmd['mbs']['requires'] = resolve_requires(pdc, mmd.requires)
else:
mmd.xmd['mbs']['requires'] = {}
if mmd.components:
if 'rpms' not in mmd.xmd['mbs']:
mmd.xmd['mbs']['rpms'] = {}
# Add missing data in RPM components
for pkgname, pkg in mmd.components.rpms.items():
if pkg.repository and not conf.rpms_allow_repository:
raise Forbidden(
"Custom component repositories aren't allowed. "
"%r bears repository %r" % (pkgname, pkg.repository))
if pkg.cache and not conf.rpms_allow_cache:
raise Forbidden(
"Custom component caches aren't allowed. "
"%r bears cache %r" % (pkgname, pkg.cache))
if not pkg.repository:
pkg.repository = conf.rpms_default_repository + pkgname
if not pkg.cache:
pkg.cache = conf.rpms_default_cache + pkgname
if not pkg.ref:
pkg.ref = 'master'
# Add missing data in included modules components
for modname, mod in mmd.components.modules.items():
if mod.repository and not conf.modules_allow_repository:
raise Forbidden(
"Custom module repositories aren't allowed. "
"%r bears repository %r" % (modname, mod.repository))
if not mod.repository:
mod.repository = conf.modules_default_repository + modname
if not mod.ref:
mod.ref = 'master'
# Check that SCM URL is valid and replace potential branches in
# pkg.ref by real SCM hash and store the result to our private xmd
# place in modulemd.
pool = ThreadPool(20)
pkg_dicts = pool.map(_scm_get_latest, mmd.components.rpms.values())
err_msg = ""
for pkg_dict in pkg_dicts:
if pkg_dict["error"]:
err_msg += pkg_dict["error"] + "\n"
else:
pkg_name = pkg_dict["pkg_name"]
pkg_ref = pkg_dict["pkg_ref"]
mmd.xmd['mbs']['rpms'][pkg_name] = {'ref': pkg_ref}
if err_msg:
raise UnprocessableEntity(err_msg)
def validate_mmd(mmd):
for modname, mod in mmd.components.modules.items():
if mod.repository and not conf.modules_allow_repository:
raise Forbidden(
"Custom module repositories aren't allowed. "
"%r bears repository %r" % (modname, mod.repository))
def merge_included_mmd(mmd, included_mmd):
"""
Merges two modulemds. This merges only metadata which are needed in
the `main` when it includes another module defined by `included_mmd`
"""
if 'rpms' in included_mmd.xmd['mbs']:
if 'rpms' not in mmd.xmd['mbs']:
mmd.xmd['mbs']['rpms'] = included_mmd.xmd['mbs']['rpms']
else:
mmd.xmd['mbs']['rpms'].update(included_mmd.xmd['mbs']['rpms'])
def record_component_builds(mmd, module, initial_batch=1,
previous_buildorder=None, main_mmd=None, session=None):
import koji # Placed here to avoid py2/py3 conflicts...
if not session:
session = db.session
# Format the modulemd by putting in defaults and replacing streams that
# are branches with commit hashes
format_mmd(mmd, module.scmurl, session=session)
# When main_mmd is set, merge the metadata from this mmd to main_mmd,
# otherwise our current mmd is main_mmd.
if main_mmd:
# Check for components that are in both MMDs before merging since MBS
# currently can't handle that situation.
duplicate_components = [rpm for rpm in main_mmd.components.rpms.keys()
if rpm in mmd.components.rpms.keys()]
if duplicate_components:
error_msg = (
                'The included module "{0}" in "{1}" has the following '
'conflicting components: {2}'
.format(mmd.name, main_mmd.name,
', '.join(duplicate_components)))
raise UnprocessableEntity(error_msg)
merge_included_mmd(main_mmd, mmd)
else:
main_mmd = mmd
# If the modulemd yaml specifies components, then submit them for build
if mmd.components:
components = mmd.components.all
components.sort(key=lambda x: x.buildorder)
# We do not start with batch = 0 here, because the first batch is
# reserved for module-build-macros. First real components must be
# planned for batch 2 and following.
batch = initial_batch
for pkg in components:
# Increment the batch number when buildorder increases.
if previous_buildorder != pkg.buildorder:
previous_buildorder = pkg.buildorder
batch += 1
# If the pkg is another module, we fetch its modulemd file
# and record its components recursively with the initial_batch
# set to our current batch, so the components of this module
# are built in the right global order.
if isinstance(pkg, modulemd.ModuleComponentModule):
full_url = pkg.repository + "?#" + pkg.ref
                # It is OK to whitelist all URLs here, because the validity
                # of every URL has already been checked in format_mmd(...).
included_mmd = _fetch_mmd(full_url, whitelist_url=True)[0]
batch = record_component_builds(included_mmd, module, batch,
previous_buildorder, main_mmd, session=session)
continue
pkgref = mmd.xmd['mbs']['rpms'][pkg.name]['ref']
full_url = pkg.repository + "?#" + pkgref
build = models.ComponentBuild(
module_id=module.id,
package=pkg.name,
format="rpms",
scmurl=full_url,
batch=batch,
ref=pkgref
)
session.add(build)
return batch
def submit_module_build_from_yaml(username, handle, optional_params=None):
yaml = handle.read()
mmd = load_mmd(yaml)
    # Mimic the way default values are generated for modules that are stored in SCM:
    # we take the filename as the module name (as opposed to the repo name), and
    # the numeric representation of the current datetime (as opposed to the
    # datetime of the last commit).
dt = datetime.utcfromtimestamp(int(time.time()))
def_name = str(handle.filename.split(".")[0])
def_version = int(dt.strftime("%Y%m%d%H%M%S"))
mmd.name = mmd.name or def_name
mmd.stream = mmd.stream or "master"
mmd.version = mmd.version or def_version
return submit_module_build(username, None, mmd, None, optional_params)
_url_check_re = re.compile(r"^[^:/]+:.*$")
def submit_module_build_from_scm(username, url, branch, allow_local_url=False,
skiptests=False, optional_params=None):
# Translate local paths into file:// URL
if allow_local_url and not _url_check_re.match(url):
log.info(
"'{}' is not a valid URL, assuming local path".format(url))
url = os.path.abspath(url)
url = "file://" + url
mmd, scm = _fetch_mmd(url, branch, allow_local_url)
if skiptests:
mmd.buildopts.rpms.macros += "\n\n%__spec_check_pre exit 0\n"
return submit_module_build(username, url, mmd, scm, optional_params)
def submit_module_build(username, url, mmd, scm, optional_params=None):
import koji # Placed here to avoid py2/py3 conflicts...
# Import it here, because SCM uses utils methods
# and fails to import them because of dep-chain.
validate_mmd(mmd)
module = models.ModuleBuild.query.filter_by(
name=mmd.name, stream=mmd.stream, version=str(mmd.version)).first()
if module:
        log.debug('Checking whether module build already exists.')
if module.state != models.BUILD_STATES['failed']:
err_msg = ('Module (state=%s) already exists. Only a new build or resubmission of '
'a failed build is allowed.' % module.state)
log.error(err_msg)
raise Conflict(err_msg)
if optional_params:
rebuild_strategy = optional_params.get('rebuild_strategy')
if rebuild_strategy and module.rebuild_strategy != rebuild_strategy:
raise ValidationError('You cannot change the module\'s "rebuild_strategy" when '
'resuming a module build')
log.debug('Resuming existing module build %r' % module)
# Reset all component builds that didn't complete
for component in module.component_builds:
if component.state and component.state != koji.BUILD_STATES['COMPLETE']:
component.state = None
db.session.add(component)
module.username = username
prev_state = module.previous_non_failed_state
if prev_state == models.BUILD_STATES['init']:
transition_to = models.BUILD_STATES['init']
else:
transition_to = models.BUILD_STATES['wait']
module.batch = 0
module.transition(conf, transition_to, "Resubmitted by %s" % username)
log.info("Resumed existing module build in previous state %s"
% module.state)
else:
log.debug('Creating new module build')
module = models.ModuleBuild.create(
db.session,
conf,
name=mmd.name,
stream=mmd.stream,
version=str(mmd.version),
modulemd=mmd.dumps(),
scmurl=url,
username=username,
**(optional_params or {})
)
db.session.add(module)
db.session.commit()
log.info("%s submitted build of %s, stream=%s, version=%s", username,
mmd.name, mmd.stream, mmd.version)
return module
def scm_url_schemes(terse=False):
"""
Definition of URL schemes supported by both frontend and scheduler.
    NOTE: only git URLs in the following formats are supported at the moment:
git://
git+http://
git+https://
git+rsync://
http://
https://
file://
:param terse=False: Whether to return terse list of unique URL schemes
even without the "://".
"""
scm_types = {
"git": ("git://", "git+http://", "git+https://",
"git+rsync://", "http://", "https://", "file://")
}
if not terse:
return scm_types
else:
scheme_list = []
for scm_type, scm_schemes in scm_types.items():
scheme_list.extend([scheme[:-3] for scheme in scm_schemes])
return list(set(scheme_list))
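
# For example, scm_url_schemes(terse=True) returns the scheme names without the
# "://" suffix, i.e. something like (order not guaranteed, a set is used):
#
#   ['git', 'git+http', 'git+https', 'git+rsync', 'http', 'https', 'file']
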
def get_scm_url_re():
schemes_re = '|'.join(map(re.escape, scm_url_schemes(terse=True)))
return re.compile(
r"(?P<giturl>(?:(?P<scheme>(" + schemes_re + r"))://(?P<host>[^/]+))?"
r"(?P<repopath>/[^\?]+))\?(?P<modpath>[^#]*)#(?P<revision>.+)")
def module_build_state_from_msg(msg):
state = int(msg.module_build_state)
# TODO better handling
assert state in models.BUILD_STATES.values(), (
'state=%s(%s) is not in %s'
% (state, type(state), list(models.BUILD_STATES.values())))
return state
def reuse_component(component, previous_component_build,
change_state_now=False):
"""
Reuses component build `previous_component_build` instead of building
    component `component`.
Returns the list of BaseMessage instances to be handled later by the
scheduler.
"""
import koji
log.info(
'Reusing component "{0}" from a previous module '
'build with the nvr "{1}"'.format(
component.package, previous_component_build.nvr))
component.reused_component_id = previous_component_build.id
component.task_id = previous_component_build.task_id
if change_state_now:
component.state = previous_component_build.state
else:
# Use BUILDING state here, because we want the state to change to
# COMPLETE by the fake KojiBuildChange message we are generating
# few lines below. If we would set it to the right state right
# here, we would miss the code path handling the KojiBuildChange
# which works only when switching from BUILDING to COMPLETE.
component.state = koji.BUILD_STATES['BUILDING']
component.state_reason = \
'Reused component from previous module build'
component.nvr = previous_component_build.nvr
nvr_dict = kobo.rpmlib.parse_nvr(component.nvr)
# Add this message to further_work so that the reused
# component will be tagged properly
return [
module_build_service.messaging.KojiBuildChange(
msg_id='reuse_component: fake msg',
build_id=None,
task_id=component.task_id,
build_new_state=previous_component_build.state,
build_name=component.package,
build_version=nvr_dict['version'],
build_release=nvr_dict['release'],
module_build_id=component.module_id,
state_reason=component.state_reason
)
]
def attempt_to_reuse_all_components(builder, session, module):
"""
    Tries to reuse all the components in a build. The components are also
    tagged into the buildroot and final tags using the `builder`.
Returns True if all components could be reused, otherwise False. When
False is returned, no component has been reused.
"""
# [(component, component_to_reuse), ...]
component_pairs = []
# Find out if we can reuse all components and cache component and
# component to reuse pairs.
for c in module.component_builds:
if c.package == "module-build-macros":
continue
component_to_reuse = get_reusable_component(
session, module, c.package)
if not component_to_reuse:
return False
component_pairs.append((c, component_to_reuse))
# Stores components we will tag to buildroot and final tag.
components_to_tag = []
# Reuse all components.
for c, component_to_reuse in component_pairs:
# Set the module.batch to the last batch we have.
if c.batch > module.batch:
module.batch = c.batch
# Reuse the component
reuse_component(c, component_to_reuse, True)
components_to_tag.append(c.nvr)
# Tag them
builder.buildroot_add_artifacts(components_to_tag, install=False)
builder.tag_artifacts(components_to_tag)
return True
def get_reusable_component(session, module, component_name):
"""
Returns the component (RPM) build of a module that can be reused
instead of needing to rebuild it
:param session: SQLAlchemy database session
:param module: the ModuleBuild object of module being built with a formatted
mmd
:param component_name: the name of the component (RPM) that you'd like to
reuse a previous build of
:return: the component (RPM) build SQLAlchemy object, if one is not found,
None is returned
"""
    # Component reuse is supported only for the koji and test backends.
if conf.system not in ['koji', 'test']:
return None
# If the rebuild strategy is "all", that means that nothing can be reused
if module.rebuild_strategy == 'all':
log.info('Cannot re-use the component because the rebuild strategy is "all".')
return None
mmd = module.mmd()
# Find the latest module that is in the done or ready state
previous_module_build = session.query(models.ModuleBuild)\
.filter_by(name=mmd.name)\
.filter_by(stream=mmd.stream)\
        .filter(models.ModuleBuild.state.in_(
            [models.BUILD_STATES['done'], models.BUILD_STATES['ready']]))\
.filter(models.ModuleBuild.scmurl.isnot(None))\
.order_by(models.ModuleBuild.time_completed.desc())
# If we are rebuilding with the "changed-and-after" option, then we can't reuse
# components from modules that were built more liberally
if module.rebuild_strategy == 'changed-and-after':
previous_module_build = previous_module_build.filter(
models.ModuleBuild.rebuild_strategy.in_(['all', 'changed-and-after']))
previous_module_build = previous_module_build.first()
# The component can't be reused if there isn't a previous build in the done
# or ready state
if not previous_module_build:
log.info("Cannot re-use. %r is the first module build." % module)
return None
old_mmd = previous_module_build.mmd()
# Perform a sanity check to make sure that the buildrequires are the same
# as the buildrequires in xmd for the passed in mmd
if set(mmd.buildrequires.keys()) != set(mmd.xmd['mbs']['buildrequires'].keys()):
log.error(
'Cannot re-use. The submitted module "{0}" has different keys in '
'mmd.buildrequires than in '
'mmd.xmd[\'mbs\'][\'buildrequires\']'.format(mmd.name))
return None
# Perform a sanity check to make sure that the buildrequires are the same
# as the buildrequires in xmd for the mmd of the previous module build
if set(old_mmd.buildrequires.keys()) != \
set(old_mmd.xmd['mbs']['buildrequires'].keys()):
log.error(
'Cannot re-use. Version "{0}" of the module "{1}" has different '
'keys in mmd.buildrequires than in '
'mmd.xmd[\'mbs\'][\'buildrequires\']'
.format(previous_module_build.version, previous_module_build.name))
return None
# If the chosen component for some reason was not found in the database,
# or the ref is missing, something has gone wrong and the component cannot
# be reused
new_module_build_component = models.ComponentBuild.from_component_name(
session, component_name, module.id)
if not new_module_build_component or not new_module_build_component.batch \
or not new_module_build_component.ref:
log.info('Cannot re-use. New component not found in the db.')
return None
prev_module_build_component = models.ComponentBuild.from_component_name(
session, component_name, previous_module_build.id)
# If the component to reuse for some reason was not found in the database,
# or the ref is missing, something has gone wrong and the component cannot
# be reused
if not prev_module_build_component or not prev_module_build_component.batch\
or not prev_module_build_component.ref:
log.info('Cannot re-use. Previous component not found in the db.')
return None
# Make sure the ref for the component that is trying to be reused
# hasn't changed since the last build
if prev_module_build_component.ref != new_module_build_component.ref:
log.info('Cannot re-use. Component commit hashes do not match.')
return None
# At this point we've determined that both module builds contain the component
# and the components share the same commit hash
if module.rebuild_strategy == 'changed-and-after':
# Make sure the batch number for the component that is trying to be reused
# hasn't changed since the last build
if prev_module_build_component.batch != new_module_build_component.batch:
log.info('Cannot re-use. Batch numbers do not match.')
return None
# If the mmd.buildopts.macros.rpms changed, we cannot reuse
modulemd_macros = ""
old_modulemd_macros = ""
if mmd.buildopts and mmd.buildopts.rpms:
modulemd_macros = mmd.buildopts.rpms.macros
if old_mmd.buildopts and old_mmd.buildopts.rpms:
        old_modulemd_macros = old_mmd.buildopts.rpms.macros
if modulemd_macros != old_modulemd_macros:
log.info('Cannot re-use. Old modulemd macros do not match the new.')
return None
# If the module buildrequires are different, then we can't reuse the
# component
    if set(mmd.buildrequires.keys()) != set(old_mmd.buildrequires.keys()):
log.info('Cannot re-use. The set of module buildrequires changed')
return None
# Make sure that the module buildrequires commit hashes are exactly the same
for br_module_name, br_module in \
mmd.xmd['mbs']['buildrequires'].items():
# Assumes that the streams have been replaced with commit hashes, so we
# can compare to see if they have changed. Since a build is unique to
# a commit hash, this is a safe test.
ref1 = br_module.get('ref')
ref2 = old_mmd.xmd['mbs']['buildrequires'][br_module_name].get('ref')
if not (ref1 and ref2) or ref1 != ref2:
log.info('Cannot re-use. The module buildrequires hashes changed')
return None
# At this point we've determined that both module builds contain the component
# with the same commit hash and they are in the same batch. We've also determined
# that both module builds depend(ed) on the same exact module builds. Now it's time
# to determine if the components before it have changed.
#
# Convert the component_builds to a list and sort them by batch
new_component_builds = list(module.component_builds)
new_component_builds.sort(key=lambda x: x.batch)
prev_component_builds = list(previous_module_build.component_builds)
prev_component_builds.sort(key=lambda x: x.batch)
new_module_build_components = []
previous_module_build_components = []
# Create separate lists for the new and previous module build. These lists
# will have an entry for every build batch *before* the component's
# batch except for 1, which is reserved for the module-build-macros RPM.
# Each batch entry will contain a set of "(name, ref)" with the name and
# ref (commit) of the component.
for i in range(new_module_build_component.batch - 1):
# This is the first batch which we want to skip since it will always
# contain only the module-build-macros RPM and it gets built every time
if i == 0:
continue
new_module_build_components.append(set([
(value.package, value.ref) for value in
new_component_builds if value.batch == i + 1
]))
previous_module_build_components.append(set([
(value.package, value.ref) for value in
prev_component_builds if value.batch == i + 1
]))
# If the previous batches don't have the same ordering and hashes, then the
# component can't be reused
if previous_module_build_components != new_module_build_components:
log.info('Cannot re-use. Ordering or commit hashes of '
'previous batches differ.')
return None
reusable_component = models.ComponentBuild.query.filter_by(
package=component_name, module_id=previous_module_build.id).one()
log.debug('Found reusable component!')
return reusable_component
def validate_koji_tag(tag_arg_names, pre='', post='-', dict_key='name'):
"""
    A decorator that validates the koji tag argument value(s) against the
    configurable list of allowed koji tag prefixes.
    Supported argument value types are: dict, list, str.
:param tag_arg_names: Str or list of parameters to validate.
:param pre: Prepend this optional string (e.g. '.' in case of disttag
validation) to each koji tag prefix.
:param post: Append this string/delimiter ('-' by default) to each koji
tag prefix.
:param dict_key: In case of a dict arg, inspect this key ('name' by default).
"""
if not isinstance(tag_arg_names, list):
tag_arg_names = [tag_arg_names]
def validation_decorator(function):
def wrapper(*args, **kwargs):
call_args = inspect.getcallargs(function, *args, **kwargs)
for tag_arg_name in tag_arg_names:
err_subject = "Koji tag validation:"
# If any of them don't appear in the function, then fail.
if tag_arg_name not in call_args:
raise ProgrammingError(
'{} Inspected argument {} is not within function args.'
' The function was: {}.'
.format(err_subject, tag_arg_name, function.__name__))
tag_arg_val = call_args[tag_arg_name]
# First, check that we have some value
if not tag_arg_val:
raise ValidationError('{} Can not validate {}. No value provided.'
.format(err_subject, tag_arg_name))
# If any of them are a dict, then use the provided dict_key
if isinstance(tag_arg_val, dict):
if dict_key not in tag_arg_val:
raise ProgrammingError(
'{} Inspected dict arg {} does not contain {} key.'
' The function was: {}.'
.format(err_subject, tag_arg_name, dict_key, function.__name__))
tag_list = [tag_arg_val[dict_key]]
elif isinstance(tag_arg_val, list):
tag_list = tag_arg_val
else:
tag_list = [tag_arg_val]
# Check to make sure the provided values match our whitelist.
for allowed_prefix in conf.koji_tag_prefixes:
if all([t.startswith(pre + allowed_prefix + post) for t in tag_list]):
break
else:
# Only raise this error if the given tags don't start with
# *any* of our allowed prefixes.
raise ValidationError(
'Koji tag validation: {} does not satisfy any of allowed prefixes: {}'
.format(tag_list,
[pre + p + post for p in conf.koji_tag_prefixes]))
# Finally.. after all that validation, call the original function
# and return its value.
return function(*args, **kwargs)
# We're replacing the original function with our synthetic wrapper,
# but dress it up to make it look more like the original function.
wrapper.__name__ = function.__name__
wrapper.__doc__ = function.__doc__
return wrapper
return validation_decorator
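
# Example usage of the validate_koji_tag decorator above (a sketch; the decorated
# method and its arguments are hypothetical): every value passed as `dest_tag`
# must start with one of conf.koji_tag_prefixes followed by '-'.
#
#   @validate_koji_tag('dest_tag')
#   def tag_artifacts(self, dest_tag, artifacts):
#       ...
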
def get_rpm_release_from_mmd(mmd):
"""
Returns the dist tag based on the modulemd metadata and MBS configuration.
"""
if not mmd.name or not mmd.stream or not mmd.version:
raise ValueError("Modulemd name, stream, and version are required.")
dist_str = '.'.join([mmd.name, mmd.stream, str(mmd.version)])
    dist_hash = hashlib.sha1(dist_str.encode('utf-8')).hexdigest()[:8]
return conf.default_dist_tag_prefix + dist_hash
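
# A worked example with made-up values: for name "testmodule", stream "master"
# and version 20170401135326, dist_str is "testmodule.master.20170401135326" and
# the returned dist tag is conf.default_dist_tag_prefix plus the first 8 hex
# digits of the SHA-1 of that string, e.g. <prefix> + "1a2b3c4d" (the hash
# digits here are invented).
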
def create_dogpile_key_generator_func(skip_first_n_args=0):
"""
Creates dogpile key_generator function with additional features:
- when models.ModuleBuild is an argument of method cached by dogpile-cache,
the ModuleBuild.id is used as a key. Therefore it is possible to cache
data per particular module build, while normally, it would be per
ModuleBuild.__str__() output, which contains also batch and other data
which changes during the build of a module.
    - it is able to skip the first N arguments of a cached method. This is useful
when the db.session or PDCClient instance is part of cached method call,
and the caching should work no matter what session instance is passed
to cached method argument.
"""
def key_generator(namespace, fn):
fname = fn.__name__
def generate_key(*arg, **kwarg):
key_template = fname + "_"
for s in arg[skip_first_n_args:]:
            if isinstance(s, models.ModuleBuild):
key_template += str(s.id)
else:
key_template += str(s) + "_"
return key_template
return generate_key
return key_generator
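
# A usage sketch for create_dogpile_key_generator_func() (the cache region setup
# below is an assumption, not something this module configures): the generated
# function is passed to dogpile.cache as function_key_generator, so that a method
# taking a db session (skipped via skip_first_n_args=1) and a ModuleBuild is
# cached per ModuleBuild.id.
#
#   from dogpile.cache import make_region
#   region = make_region(
#       function_key_generator=create_dogpile_key_generator_func(skip_first_n_args=1)
#   ).configure('dogpile.cache.memory')
#
#   @region.cache_on_arguments()
#   def get_module_build_dependencies(session, module_build):
#       ...
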