diff --git a/module_build_service/scheduler/producer.py b/module_build_service/scheduler/producer.py
index 116d035d..daec7f65 100644
--- a/module_build_service/scheduler/producer.py
+++ b/module_build_service/scheduler/producer.py
@@ -26,6 +26,7 @@ fedmsg-hub. This class polls the database for tasks to do.
 
 import koji
 import operator
+import time
 from datetime import timedelta
 from sqlalchemy.orm import lazyload
 from moksha.hub.api.producer import PollingProducer
@@ -40,6 +41,14 @@ from module_build_service.builder import GenericBuilder
 class MBSProducer(PollingProducer):
     frequency = timedelta(seconds=conf.polling_interval)
 
+    NEW_REPO_TIMEOUT = 20 * 60  # seconds (20 minutes)
+
+    def __init__(self, hub):
+        super(MBSProducer, self).__init__(hub)
+
+        # Module build id -> time.time() when we first saw it waiting for repo.
+        self._waiting_for_repo = {}
+
     def poll(self):
         with models.make_session(conf) as session:
             self.log_summary(session)
@@ -48,6 +57,7 @@ class MBSProducer(PollingProducer):
             self.process_open_component_builds(session)
             self.fail_lost_builds(session)
             self.process_paused_module_builds(conf, session)
+            self.trigger_new_repo_when_staled(conf, session)
 
         log.info('Poller will now sleep for "{}" seconds'
                  .format(conf.polling_interval))
@@ -74,6 +84,14 @@ class MBSProducer(PollingProducer):
                 if not component_build.task_id:
                     continue
 
+                # Don't check tasks for components which have been reused,
+                # they may have BUILDING state temporarily before we tag them
+                # to new module tag. Checking them would be waste of resources.
+                if component_build.reused_component_id:
+                    log.debug('Skipping check for task "{0}", the component '
+                              'has been reused.'.format(component_build.task_id))
+                    continue
+
                 task_id = component_build.task_id
 
                 log.info('Checking status of task_id "{0}"'.format(task_id))
@@ -106,7 +124,7 @@ class MBSProducer(PollingProducer):
                 if task_info['state'] in state_mapping:
                     # Fake a fedmsg message on our internal queue
                     msg = module_build_service.messaging.KojiBuildChange(
-                        msg_id='a faked internal message',
+                        msg_id='producer::fail_lost_builds fake msg',
                         build_id=component_build.task_id,
                         task_id=component_build.task_id,
                         build_name=component_build.package,
@@ -194,3 +212,49 @@ class MBSProducer(PollingProducer):
             if module_build_service.utils.at_concurrent_component_threshold(
                     config, session):
                 break
+
+    def trigger_new_repo_when_staled(self, config, session):
+        """
+        Sometimes the Koji repo regeneration stays in "init" state without
+        doing anything and our module build gets stuck. When a module build
+        waits longer than NEW_REPO_TIMEOUT, trigger newRepo again for it.
+        """
+        if config.system != 'koji':
+            return
+
+        # Ids of modules still in the 'build' state. Anything else tracked in
+        # _waiting_for_repo finished between two runs and is dropped below.
+        checked_modules = set()
+
+        for module_build in session.query(models.ModuleBuild).filter_by(
+                state=models.BUILD_STATES['build']).all():
+            checked_modules.add(module_build.id)
+            if module_build.current_batch(koji.BUILD_STATES['BUILDING']):
+                # There are some components building, so in case this module
+                # has been marked as 'waiting for repo', it is no longer true.
+                if module_build.id in self._waiting_for_repo:
+                    log.info("Removing module %r from the list of modules "
+                             "which are waiting on repo regeneration. It has "
+                             "components in 'build' state.", module_build)
+                    del self._waiting_for_repo[module_build.id]
+            else:
+                if module_build.id not in self._waiting_for_repo:
+                    log.info("Adding module %r to list of modules waiting for "
+                             "repo-regen.", module_build)
+                    self._waiting_for_repo[module_build.id] = time.time()
+
+                staled_since = self._waiting_for_repo[module_build.id]
+                if staled_since + self.NEW_REPO_TIMEOUT < time.time():
+                    log.info("Triggering repo-regen for module %r. Kojira "
+                             "failed to create new repo in given time.", module_build)
+                    koji_session = module_build_service.builder.KojiModuleBuilder\
+                        .get_session(config, None)
+
+                    taginfo = koji_session.getTag(module_build.koji_tag + "-build")
+                    koji_session.newRepo(taginfo['name'])
+                    del self._waiting_for_repo[module_build.id]
+
+        # Drop finished builds; iterate a snapshot because we delete entries.
+        for module_id in list(self._waiting_for_repo):
+            if module_id not in checked_modules:
+                del self._waiting_for_repo[module_id]
diff --git a/tests/test_scheduler/test_poller.py b/tests/test_scheduler/test_poller.py
index 71bf15fd..6aed9e61 100644
--- a/tests/test_scheduler/test_poller.py
+++ b/tests/test_scheduler/test_poller.py
@@ -35,6 +35,7 @@ import module_build_service.scheduler.handlers.components
 from module_build_service.builder import GenericBuilder, KojiModuleBuilder
 from module_build_service.scheduler.producer import MBSProducer
 import six.moves.queue as queue
+import time
 
 BASE_DIR = path.abspath(path.dirname(__file__))
 CASSETTES_DIR = path.join(
@@ -88,3 +89,101 @@ class TestPoller(unittest.TestCase):
         components = module_build.current_batch()
         for component in components:
             self.assertEqual(component.state, koji.BUILD_STATES["BUILDING"])
+
+    def test_trigger_new_repo_when_staled(self, crete_builder,
+                                          koji_get_session, global_consumer,
+                                          dbg):
+        """
+        Tests that we call koji_session.newRepo when module build is stalled.
+        """
+        consumer = mock.MagicMock()
+        consumer.incoming = queue.Queue()
+        global_consumer.return_value = consumer
+
+        koji_session = mock.MagicMock()
+        koji_session.getTag = lambda tag_name: {'name': tag_name}
+        koji_get_session.return_value = koji_session
+
+        builder = mock.MagicMock()
+        builder.buildroot_ready.return_value = False
+        crete_builder.return_value = builder
+
+        # Change the batch to 2, so the module build is in state where
+        # it is not building anything, but the state is "build".
+        module_build = models.ModuleBuild.query.filter_by(id=2).one()
+        module_build.batch = 2
+        components = module_build.current_batch()
+        for component in components:
+            component.state = koji.BUILD_STATES["COMPLETE"]
+        db.session.commit()
+
+        hub = mock.MagicMock()
+        poller = MBSProducer(hub)
+        poller.poll()
+
+        # newRepo should not be called right now, because the timeout is
+        # not reached yet.
+        self.assertFalse(koji_session.newRepo.called)
+
+        # Try again after 25 minutes, newRepo should be called
+        with patch("time.time", return_value=time.time() + 25 * 60):
+            poller.poll()
+        koji_session.newRepo.assert_called_once_with("module-testmodule-build")
+
+        koji_session.newRepo.reset_mock()
+
+        # Try again after 35 minutes, newRepo should not be called
+        with patch("time.time", return_value=time.time() + 35 * 60):
+            poller.poll()
+        self.assertFalse(koji_session.newRepo.called)
+
+        # Change module state to ready, it should be removed from the list
+        # of modules waiting for repo
+        module_build = models.ModuleBuild.query.filter_by(id=2).one()
+        module_build.state = 5
+        db.session.commit()
+
+        self.assertEqual(len(poller._waiting_for_repo), 1)
+        poller.poll()
+        self.assertEqual(len(poller._waiting_for_repo), 0)
+
+    def test_trigger_new_repo_when_staled_kojira_managed_that(
+            self, crete_builder, koji_get_session, global_consumer, dbg):
+        """
+        Tests that we do not call koji_session.newRepo when module build was
+        stalled but kojira managed to rebuild the repo in time.
+        """
+        consumer = mock.MagicMock()
+        consumer.incoming = queue.Queue()
+        global_consumer.return_value = consumer
+
+        koji_session = mock.MagicMock()
+        koji_session.getTag = lambda tag_name: {'name': tag_name}
+        koji_get_session.return_value = koji_session
+
+        builder = mock.MagicMock()
+        builder.buildroot_ready.return_value = False
+        crete_builder.return_value = builder
+
+        # Change the batch to 2, so the module build is in state where
+        # it is not building anything, but the state is "build".
+        module_build = models.ModuleBuild.query.filter_by(id=2).one()
+        module_build.batch = 2
+        components = module_build.current_batch()
+        for component in components:
+            component.state = koji.BUILD_STATES["COMPLETE"]
+        db.session.commit()
+
+        hub = mock.MagicMock()
+        poller = MBSProducer(hub)
+        poller.poll()
+
+        module_build.batch = 3
+        components = module_build.current_batch()
+        for component in components:
+            component.state = koji.BUILD_STATES["BUILDING"]
+        db.session.commit()
+
+        with patch("time.time", return_value=time.time() + 25 * 60):
+            poller.poll()
+        self.assertFalse(koji_session.newRepo.called)