Source code for ironic.drivers.modules.redfish.firmware

#
#    Licensed under the Apache License, Version 2.0 (the "License"); you may
#    not use this file except in compliance with the License. You may obtain
#    a copy of the License at
#
#         http://www.apache.org/licenses/LICENSE-2.0
#
#    Unless required by applicable law or agreed to in writing, software
#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
#    License for the specific language governing permissions and limitations
#    under the License.

import time
from urllib.parse import urlparse

from oslo_log import log
from oslo_utils import timeutils
import sushy

from ironic.common import async_steps
from ironic.common import exception
from ironic.common.i18n import _
from ironic.common import metrics_utils
from ironic.common import states
from ironic.conductor import periodics
from ironic.conductor import utils as manager_utils
from ironic.conf import CONF
from ironic.drivers import base
from ironic.drivers.modules import deploy_utils
from ironic.drivers.modules.redfish import firmware_utils
from ironic.drivers.modules.redfish import utils as redfish_utils
from ironic import objects

LOG = log.getLogger(__name__)

METRICS = metrics_utils.get_metrics_logger(__name__)

# Temporary field names stored in node.driver_internal_info
BMC_FW_VERSION_BEFORE_UPDATE = 'bmc_fw_version_before_update'
FIRMWARE_REBOOT_REQUESTED = 'firmware_reboot_requested'

# Temporary field names stored in fw_upd/current_update settings dict
NIC_NEEDS_POST_COMPLETION_REBOOT = 'nic_needs_post_completion_reboot'
NIC_STARTING_TIMESTAMP = 'nic_starting_timestamp'
NIC_REBOOT_TRIGGERED = 'nic_reboot_triggered'
BIOS_REBOOT_TRIGGERED = 'bios_reboot_triggered'


[docs] class RedfishFirmware(base.FirmwareInterface): _FW_SETTINGS_ARGSINFO = { 'settings': { 'description': ( 'A list of dicts with firmware components to be updated' ), 'required': True } }
def get_properties(self):
    """Return the properties of the interface.

    The result is a fresh copy of the common Redfish properties, so
    callers may modify it freely.

    :returns: dictionary of <property name>:<property description>
        entries.
    """
    properties = redfish_utils.COMMON_PROPERTIES.copy()
    return properties
def validate(self, task):
    """Validate the driver information needed by the redfish driver.

    Validation is delegated entirely to
    ``redfish_utils.parse_driver_info``; its result is discarded and only
    the exceptions it raises matter here.

    :param task: a TaskManager instance containing the node to act on.
    :raises: InvalidParameterValue on malformed parameter(s)
    :raises: MissingParameterValue on missing parameter(s)
    """
    node = task.node
    redfish_utils.parse_driver_info(node)
@METRICS.timer('RedfishFirmware.cache_firmware_components')
def cache_firmware_components(self, task):
    """Store or update Firmware Components on the given node.

    Collects the BIOS version (from the System), the BMC version (from
    the Manager) and the NIC firmware package versions (from the Chassis
    NetworkAdapters), then synchronises them with the stored
    FirmwareComponent records of the node: new components are created
    and changed ones have both ``current_version`` and
    ``last_version_flashed`` updated.

    :param task: a TaskManager instance.
    :raises: UnsupportedDriverExtension, if no firmware component could
        be retrieved from the bare metal.
    """
    node_id = task.node.id
    components = []

    # NOTE(iurygregory): currently we will only retrieve BIOS and BMC
    # firmware information through the redfish system and manager.
    system = redfish_utils.get_system(task.node)

    bios_version = system.bios_version
    if not bios_version:
        LOG.debug('Could not retrieve BiosVersion in node %(node_uuid)s '
                  'system %(system)s',
                  {'node_uuid': task.node.uuid,
                   'system': system.identity})
    else:
        components.append({'component': redfish_utils.BIOS,
                           'current_version': bios_version})

    # NOTE(iurygregory): normally we only rely on the System to
    # perform actions, but to retrieve the BMC Firmware we need to
    # access the Manager.
    try:
        manager = redfish_utils.get_manager(task.node, system)
        bmc_version = manager.firmware_version
        if not bmc_version:
            LOG.debug('Could not retrieve FirmwareVersion in node '
                      '%(node_uuid)s manager %(manager)s',
                      {'node_uuid': task.node.uuid,
                       'manager': manager.identity})
        else:
            components.append({'component': redfish_utils.BMC,
                               'current_version': bmc_version})
    except exception.RedfishError:
        LOG.warning('No manager available to retrieve Firmware '
                    'from the bmc of node %s', task.node.uuid)

    # None means "lookup failed"; an empty list means "lookup worked but
    # nothing was reported". The two cases are logged differently below.
    nic_components = None
    try:
        nic_components = self.retrieve_nic_components(task, system)
    except (exception.RedfishError,
            sushy.exceptions.BadRequestError,
            sushy.exceptions.MissingAttributeError) as e:
        # NOTE(janders) if an exception is raised, log a warning
        # with exception details. This is important for HP hardware
        # which at the time of writing this are known to return 400
        # responses to GET NetworkAdapters while OS isn't fully booted
        LOG.warning('Unable to access NetworkAdapters on node '
                    '%(node_uuid)s, Error: %(error)s',
                    {'node_uuid': task.node.uuid, 'error': e})

    if nic_components:
        components.extend(nic_components)
    elif nic_components == []:
        # NOTE(janders) if no exception is raised but no NICs are
        # returned, state that clearly but in a lower severity message
        LOG.debug('Could not retrieve Firmware Package Version from '
                  'NetworkAdapters on node %(node_uuid)s',
                  {'node_uuid': task.node.uuid})

    if not components:
        error_msg = (_('Cannot retrieve firmware for node %s: no '
                       'supported components') % task.node.uuid)
        LOG.error(error_msg)
        raise exception.UnsupportedDriverExtension(error_msg)

    # The third element (unchanged components) needs no action.
    to_create, to_update, _unchanged = (
        objects.FirmwareComponentList.sync_firmware_components(
            task.context, node_id, components))

    for entry in to_create or ():
        objects.FirmwareComponent(
            task.context,
            node_id=node_id,
            component=entry['component'],
            current_version=entry['current_version']
        ).create()

    for entry in to_update or ():
        stored = objects.FirmwareComponent.get(
            task.context,
            node_id=node_id,
            name=entry['component']
        )
        version = entry.get('current_version')
        stored.last_version_flashed = version
        stored.current_version = version
        stored.save()
def retrieve_nic_components(self, task, system):
    """Helper function to retrieve all NIC components on a given node.

    One entry is produced for every NetworkAdapter controller that
    reports a firmware package version. The component name is the NIC
    prefix followed by the adapter's serial number or, when no serial
    number is reported, the adapter's identity.

    :param task: a TaskManager instance.
    :param system: a Redfish System object
    :returns: a list of NIC components; empty when the chassis or its
        NetworkAdapters collection is not available.
    """
    try:
        chassis = redfish_utils.get_chassis(task.node, system)
    except exception.RedfishError:
        LOG.debug('No chassis available to retrieve NetworkAdapters '
                  'firmware information on node %(node_uuid)s',
                  {'node_uuid': task.node.uuid})
        return []

    try:
        network_adapters = chassis.network_adapters
        if network_adapters is None:
            LOG.debug('NetworkAdapters not available on chassis for '
                      'node %(node_uuid)s',
                      {'node_uuid': task.node.uuid})
            return []
        adapters = network_adapters.get_members()
    except sushy.exceptions.MissingAttributeError:
        LOG.debug('NetworkAdapters not available on chassis for '
                  'node %(node_uuid)s',
                  {'node_uuid': task.node.uuid})
        return []

    nic_components = []
    for adapter in adapters:
        for controller in adapter.controllers:
            version = controller.firmware_package_version
            if not version:
                # Controllers without a firmware package version are
                # not tracked.
                continue

            serial = adapter.serial_number
            if serial:
                adapter_id = serial
                LOG.debug('Using SerialNumber %(serial_number)s for '
                          'NetworkAdapter %(net_adp_id)s',
                          {'serial_number': serial,
                           'net_adp_id': adapter.identity})
            else:
                adapter_id = adapter.identity
                LOG.debug('Using Identity %(identity)s for '
                          'NetworkAdapter %(net_adp_id)s',
                          {'identity': adapter.identity,
                           'net_adp_id': adapter.identity})

            nic_components.append({
                'component': redfish_utils.NIC_COMPONENT_PREFIX + adapter_id,
                'current_version': version,
            })
    return nic_components
@METRICS.timer('RedfishFirmware.update')
@base.deploy_step(priority=0, abortable=False,
                  argsinfo=_FW_SETTINGS_ARGSINFO)
@base.clean_step(priority=0, abortable=False,
                 argsinfo=_FW_SETTINGS_ARGSINFO,
                 requires_ramdisk=True)
@base.service_step(priority=0, abortable=False,
                   argsinfo=_FW_SETTINGS_ARGSINFO,
                   requires_ramdisk=False)
def update(self, task, settings):
    """Update the Firmware on the node using the settings for components.

    Starts the first update in ``settings`` and records the remaining
    work in ``driver_internal_info`` so that periodic polling can monitor
    the task and drive the following updates and reboots.

    :param task: a TaskManager instance.
    :param settings: a list of dictionaries, each dictionary contains the
        component name and the url that will be used to update the
        firmware.
    :raises: UnsupportedDriverExtension, if the node's driver doesn't
        support update via the interface.
    :raises: InvalidParameterValue, if validation of the settings fails.
    :raises: MissingParameterValue, if some required parameters are
        missing.
    :returns: the value of ``async_steps.get_return_state(node)``, i.e.
        the wait state that keeps the step active while the firmware
        update is in progress asynchronously.
    """
    firmware_utils.validate_firmware_interface_update_args(settings)

    node = task.node
    update_service = redfish_utils.get_update_service(node)

    LOG.debug('Updating Firmware on node %(node_uuid)s with settings '
              '%(settings)s',
              {'node_uuid': node.uuid, 'settings': settings})
    self._execute_firmware_update(node, update_service, settings)

    # Store updated settings and start time for overall timeout tracking
    started_at = timeutils.utcnow().isoformat()
    node.set_driver_internal_info('redfish_fw_updates', settings)
    node.set_driver_internal_info('redfish_fw_update_start_time',
                                  started_at)
    node.save()

    # Return wait state to keep the step active and let polling handle
    # the monitoring and eventual completion/reboot
    return async_steps.get_return_state(node)
def _clean_temp_fields(self, node): """Clean up temporary fields used during firmware update monitoring. This ensures no stale data interferes with new firmware updates. :param node: the Ironic node object """ # BMC-related temp fields node.del_driver_internal_info(BMC_FW_VERSION_BEFORE_UPDATE) # General firmware temp fields node.del_driver_internal_info(FIRMWARE_REBOOT_REQUESTED) def _setup_bmc_update_monitoring(self, node, fw_upd): """Set up monitoring for BMC firmware update. BMC updates do not reboot immediately. Instead, we check the BMC version periodically. If the version changed, we continue without reboot. If timeout expires without version change, we trigger a reboot. :param node: the Ironic node object :param fw_upd: firmware update settings dict """ # Clean any stale temp fields from previous updates self._clean_temp_fields(node) # Record current BMC version before update try: system = redfish_utils.get_system(node) manager = redfish_utils.get_manager(node, system) current_bmc_version = manager.firmware_version node.set_driver_internal_info( BMC_FW_VERSION_BEFORE_UPDATE, current_bmc_version) LOG.debug('BMC version before update for node %(node)s: ' '%(version)s', {'node': node.uuid, 'version': current_bmc_version}) except Exception as e: LOG.warning('Could not read BMC version before update for ' 'node %(node)s: %(error)s', {'node': node.uuid, 'error': e}) LOG.info('BMC firmware update for node %(node)s. 
' 'Monitoring BMC version instead of immediate reboot.', {'node': node.uuid}) # Use wait_interval or default reboot delay wait_interval = fw_upd.get('wait') if wait_interval is None: wait_interval = CONF.redfish.firmware_update_reboot_delay fw_upd['wait'] = wait_interval # Set wait_start_time for polling interval and bmc_check_start_time # for total timeout tracking (wait_start_time gets updated each poll) start_time = str(timeutils.utcnow().isoformat()) fw_upd['wait_start_time'] = start_time fw_upd['bmc_check_start_time'] = start_time # Mark this as a BMC update so we can handle timeouts properly fw_upd['component_type'] = redfish_utils.BMC # BMC: Set async flags without immediate reboot deploy_utils.set_async_step_flags( node, reboot=False, polling=True ) def _setup_nic_update_monitoring(self, node): """Set up monitoring for NIC firmware update. NIC firmware behavior varies by hardware. Some NICs update immediately, some need reboot to start. The handler will wait 30s and decide whether to reboot. :param node: the Ironic node object """ # Clean any stale temp fields from previous updates self._clean_temp_fields(node) LOG.info('NIC firmware update for node %(node)s. Will monitor ' 'task state to determine if reboot is needed.', {'node': node.uuid}) # NIC: Set async flags with reboot enabled # (reboot will be triggered conditionally if hardware needs it) deploy_utils.set_async_step_flags( node, reboot=True, polling=True ) def _setup_bios_update_monitoring(self, node): """Set up monitoring for BIOS firmware update. BIOS updates require a reboot to apply, so we trigger it as soon as the update task begins rather than waiting for completion. :param node: the Ironic node object """ # Clean any stale temp fields from previous updates self._clean_temp_fields(node) LOG.info('BIOS firmware update for node %(node)s. 
Will reboot ' 'when update task starts.', {'node': node.uuid}) # BIOS: Set async flags with reboot enabled deploy_utils.set_async_step_flags( node, reboot=True, polling=True ) def _setup_default_update_monitoring(self, node, fw_upd): """Set up monitoring for unknown/default firmware component types. Default behavior for unknown component types uses standard reboot handling with configurable wait interval. :param node: the Ironic node object :param fw_upd: firmware update settings dict """ # Clean any stale temp fields from previous updates self._clean_temp_fields(node) component = fw_upd.get('component', '') LOG.warning( 'Unknown component type %(component)s for node %(node)s. ' 'Using default firmware update behavior.', {'component': component, 'node': node.uuid}) wait_interval = fw_upd.get('wait') if wait_interval is None: wait_interval = ( node.driver_info.get('firmware_update_unresponsive_bmc_wait') or CONF.redfish.firmware_update_wait_unresponsive_bmc) fw_upd['wait'] = wait_interval # Default: Set async flags with reboot enabled deploy_utils.set_async_step_flags( node, reboot=True, polling=True ) def _get_current_bmc_version(self, node): """Get current BMC firmware version. Note: BMC may be temporarily unresponsive after firmware update. Expected exceptions (timeouts, connection refused, HTTP errors) are caught and logged, returning None to indicate version unavailable. 
:param node: the Ironic node object :returns: Current BMC firmware version string, or None if BMC is unresponsive/inaccessible """ try: system = redfish_utils.get_system(node) manager = redfish_utils.get_manager(node, system) return manager.firmware_version except (exception.RedfishError, exception.RedfishConnectionError, sushy.exceptions.SushyError) as e: # BMC unresponsiveness is expected after firmware update # (timeouts, connection refused, HTTP 4xx/5xx errors) LOG.debug('BMC temporarily unresponsive for node %(node)s: ' '%(error)s', {'node': node.uuid, 'error': e}) return None def _handle_bmc_update_completion(self, task, update_service, settings, current_update): """Handle BMC firmware update completion with version checking. For BMC updates, we don't reboot immediately. Instead, we check the BMC version periodically. If the version changed, we continue without reboot. If timeout expires without version change, we trigger a reboot. :param task: a TaskManager instance :param update_service: the sushy firmware update service :param settings: firmware update settings :param current_update: the current firmware update being processed """ node = task.node # Try to get current BMC version # Note: BMC may be unresponsive after firmware update - expected current_version = self._get_current_bmc_version(node) version_before = node.driver_internal_info.get( BMC_FW_VERSION_BEFORE_UPDATE) # If we can read the version and it changed, update is complete if (current_version is not None and version_before is not None and current_version != version_before): LOG.info( 'BMC firmware version for node %(node)s changed from ' '%(old)s to %(new)s. Update complete. 
Continuing without ' 'reboot.', {'node': node.uuid, 'old': version_before, 'new': current_version}) node.del_driver_internal_info(BMC_FW_VERSION_BEFORE_UPDATE) node.save() self._continue_updates(task, update_service, settings) return # Check if we've been checking for too long check_start_time = current_update.get('bmc_check_start_time') if check_start_time: check_start = timeutils.parse_isotime(check_start_time) elapsed_time = timeutils.utcnow(True) - check_start timeout = current_update.get( 'wait', CONF.redfish.firmware_update_reboot_delay) if elapsed_time.seconds >= timeout: # Timeout: version didn't change or BMC unresponsive if (current_version is not None and version_before is not None and current_version == version_before): # Version didn't change - skip reboot LOG.info( 'BMC firmware version for node %(node)s did not ' 'change (still %(version)s). Update appears to be ' 'a no-op or does not require reboot. Continuing ' 'without reboot.', {'node': node.uuid, 'version': current_version}) else: # Version changed or we can't tell - reboot to apply LOG.warning( 'BMC firmware version check timeout expired for ' 'node %(node)s after %(elapsed)s seconds. 
' 'Will reboot to complete firmware update.', {'node': node.uuid, 'elapsed': elapsed_time.seconds}) # Mark that reboot is needed node.set_driver_internal_info( FIRMWARE_REBOOT_REQUESTED, True) # Enable reboot flag now that we're ready to reboot deploy_utils.set_async_step_flags( node, reboot=True, polling=True ) node.del_driver_internal_info(BMC_FW_VERSION_BEFORE_UPDATE) node.save() self._continue_updates(task, update_service, settings) return # Continue checking - set wait to check again wait_interval = ( CONF.redfish.firmware_update_bmc_version_check_interval) current_update['wait'] = wait_interval current_update['wait_start_time'] = str( timeutils.utcnow().isoformat()) current_update['bmc_version_checking'] = True node.set_driver_internal_info('redfish_fw_updates', settings) node.save() LOG.debug('BMC firmware version check continuing for node %(node)s. ' 'Will check again in %(interval)s seconds.', {'node': node.uuid, 'interval': wait_interval}) def _handle_nic_update_completion(self, task, update_service, settings, current_update): """Handle NIC firmware update completion. For NIC updates, check if a reboot is needed based on whether the task went through the Running state (needs reboot after completion) or if reboot already occurred during the Starting phase. :param task: a TaskManager instance :param update_service: the sushy firmware update service :param settings: firmware update settings :param current_update: the current firmware update being processed """ node = task.node # Check if reboot is needed (task went to Running state) needs_reboot = current_update.get( NIC_NEEDS_POST_COMPLETION_REBOOT, False) if needs_reboot: LOG.info( 'NIC firmware update task completed for node ' '%(node)s. 
Reboot required to apply update.', {'node': node.uuid}) # Mark that reboot is needed node.set_driver_internal_info( FIRMWARE_REBOOT_REQUESTED, True) # Clean up flags current_update.pop(NIC_NEEDS_POST_COMPLETION_REBOOT, None) current_update.pop(NIC_STARTING_TIMESTAMP, None) current_update.pop(NIC_REBOOT_TRIGGERED, None) else: LOG.info( 'NIC firmware update task completed for node ' '%(node)s. Reboot already occurred during update ' 'start.', {'node': node.uuid}) # Clean up all NIC-related flags current_update.pop(NIC_STARTING_TIMESTAMP, None) current_update.pop(NIC_REBOOT_TRIGGERED, None) self._continue_updates(task, update_service, settings) def _execute_firmware_update(self, node, update_service, settings): """Executes the next firmware update to the node Executes the first firmware update in the settings list to the node. :param node: the node that will have a firmware update executed. :param update_service: the sushy firmware update service. :param settings: remaining settings for firmware update that needs to be executed. """ fw_upd = settings[0] # Store power timeout to use on reboot operations fw_upd['power_timeout'] = CONF.redfish.firmware_update_reboot_delay # NOTE(janders) try to get the collection of Systems on the BMC # to determine if there may be more than one System try: systems_collection = redfish_utils.get_system_collection(node) except exception.RedfishError as e: LOG.error('Failed getting Redfish Systems Collection' ' for node %(node)s. Error %(error)s', {'node': node.uuid, 'error': e}) raise exception.RedfishError(error=e) count = len(systems_collection.members_identities) # NOTE(janders) if we see more than one System on the BMC, assume that # we need to explicitly specify Target parameter when calling # SimpleUpdate. This is needed for compatibility with sushy-tools # in automated testing using VMs. 
if count > 1: target = node.driver_info.get('redfish_system_id') targets = [target] else: targets = None component_url, cleanup = self._stage_firmware_file(node, fw_upd) LOG.debug('Applying new firmware %(url)s for %(component)s on node ' '%(node_uuid)s', {'url': fw_upd['url'], 'component': fw_upd['component'], 'node_uuid': node.uuid}) try: if targets is not None: task_monitor = update_service.simple_update(component_url, targets=targets) else: task_monitor = update_service.simple_update(component_url) except sushy.exceptions.MissingAttributeError as e: LOG.error('The attribute #UpdateService.SimpleUpdate is missing ' 'on node %(node)s. Error: %(error)s', {'node': node.uuid, 'error': e.message}) raise exception.RedfishError(error=e) # Store task monitor URI for periodic task polling # NOTE(janders): Component-specific wait/reboot behavior is now # handled by the update() method and periodic polling, not here fw_upd['task_monitor'] = task_monitor.task_monitor_uri node.set_driver_internal_info('redfish_fw_updates', settings) if cleanup: fw_clean = node.driver_internal_info.get('firmware_cleanup') if not fw_clean: fw_clean = [cleanup] elif cleanup not in fw_clean: fw_clean.append(cleanup) node.set_driver_internal_info('firmware_cleanup', fw_clean) component = fw_upd.get('component', '') component_type = redfish_utils.get_component_type(component) if component_type == redfish_utils.BMC: self._setup_bmc_update_monitoring(node, fw_upd) elif component_type == redfish_utils.NIC: self._setup_nic_update_monitoring(node) elif component_type == redfish_utils.BIOS: self._setup_bios_update_monitoring(node) else: self._setup_default_update_monitoring(node, fw_upd) def _validate_resources_stability(self, node): """Validate that BMC resources are consistently available. Requires consecutive successful responses from System, Manager, and NetworkAdapters resources before considering them stable. 
The number of required successes is configured via CONF.redfish.firmware_update_required_successes. Timeout is configured via CONF.redfish.firmware_update_resource_validation_timeout. :param node: the Ironic node object :raises: RedfishError if resources don't stabilize within timeout """ timeout = CONF.redfish.firmware_update_resource_validation_timeout required_successes = CONF.redfish.firmware_update_required_successes validation_interval = CONF.redfish.firmware_update_validation_interval # Skip validation if validation is disabled via configuration if required_successes == 0 or timeout == 0: reasons = [] if required_successes == 0: reasons.append('required_successes=0') if timeout == 0: reasons.append('validation_timeout=0') LOG.info('BMC resource validation disabled (%s) for node %(node)s', ', '.join(reasons), {'node': node.uuid}) return LOG.debug('Starting resource stability validation for node %(node)s ' '(timeout: %(timeout)s seconds, ' 'required_successes: %(required)s, ' 'validation_interval: %(interval)s seconds)', {'node': node.uuid, 'timeout': timeout, 'required': required_successes, 'interval': validation_interval}) start_time = time.time() end_time = start_time + timeout consecutive_successes = 0 last_exc = None while time.time() < end_time: try: # Test System resource system = redfish_utils.get_system(node) # Test Manager resource redfish_utils.get_manager(node, system) # Test Chassis and NetworkAdapters resource (if available) # Some systems may not have NetworkAdapters, which is valid chassis = redfish_utils.get_chassis(node, system) try: network_adapters = chassis.network_adapters if network_adapters is not None: network_adapters.get_members() except sushy.exceptions.MissingAttributeError: # NetworkAdapters not available is acceptable pass # All resources successful consecutive_successes += 1 LOG.debug('Resource validation success %(count)d/%(required)d ' 'for node %(node)s', {'count': consecutive_successes, 'required': required_successes, 
'node': node.uuid}) if consecutive_successes >= required_successes: LOG.info('All tested Redfish resources stable and ' ' available for node %(node)s', {'node': node.uuid}) return except (exception.RedfishError, exception.RedfishConnectionError, sushy.exceptions.SushyError) as e: LOG.debug('BMC resource validation failed for node %(node)s: ' '%(error)s. This may indicate the BMC is still ' 'restarting or recovering from firmware update.', {'node': node.uuid, 'error': e}) # Resource not available yet, reset counter if consecutive_successes > 0: LOG.debug('Resource validation interrupted for node ' '%(node)s, resetting success counter ' '(error: %(error)s)', {'node': node.uuid, 'error': e}) consecutive_successes = 0 last_exc = e # Wait before next validation attempt time.sleep(validation_interval) # Timeout reached without achieving stability error_msg = _('BMC resources failed to stabilize within ' '%(timeout)s seconds for node %(node)s') % { 'timeout': timeout, 'node': node.uuid} if last_exc: error_msg += _(', last error: %(error)s') % {'error': last_exc} LOG.error(error_msg) raise exception.RedfishError(error=error_msg) def _continue_updates(self, task, update_service, settings): """Continues processing the firmware updates Continues to process the firmware updates on the node. First monitors the current task completion, then validates resource stability before proceeding to next update or completion. Note that the caller must have an exclusive lock on the node. :param task: a TaskManager instance containing the node to act on. 
:param update_service: the sushy firmware update service :param settings: the remaining firmware updates to apply """ node = task.node fw_upd = settings[0] wait_interval = fw_upd.get('wait') if wait_interval: time_now = str(timeutils.utcnow().isoformat()) fw_upd['wait_start_time'] = time_now LOG.debug('Waiting at %(time)s for %(seconds)s seconds after ' '%(component)s firmware update %(url)s ' 'on node %(node)s', {'time': time_now, 'seconds': wait_interval, 'component': fw_upd['component'], 'url': fw_upd['url'], 'node': node.uuid}) node.set_driver_internal_info('redfish_fw_updates', settings) node.save() return if len(settings) == 1: # Last firmware update - check if reboot is needed reboot_requested = node.driver_internal_info.get( FIRMWARE_REBOOT_REQUESTED, False) self._clear_updates(node) LOG.info('Firmware updates completed for node %(node)s', {'node': node.uuid}) # If reboot was requested (e.g., for BMC timeout or NIC # completion), trigger the reboot before notifying conductor if reboot_requested: LOG.info('Rebooting node %(node)s to apply firmware updates', {'node': node.uuid}) manager_utils.node_power_action(task, states.REBOOT) LOG.debug('Validating BMC responsiveness before resuming ' 'conductor operations for node %(node)s', {'node': node.uuid}) self._validate_resources_stability(node) if task.node.clean_step: manager_utils.notify_conductor_resume_clean(task) elif task.node.service_step: manager_utils.notify_conductor_resume_service(task) elif task.node.deploy_step: manager_utils.notify_conductor_resume_deploy(task) else: # Validate BMC resources are stable before continuing next update LOG.info('Validating BMC responsiveness before continuing ' 'to next firmware update for node %(node)s', {'node': node.uuid}) self._validate_resources_stability(node) settings.pop(0) self._execute_firmware_update(node, update_service, settings) node.save() # Only reboot if the component code requested it. 
if task.node.clean_step: reboot_field = async_steps.CLEANING_REBOOT elif task.node.deploy_step: reboot_field = async_steps.DEPLOYMENT_REBOOT elif task.node.service_step: reboot_field = async_steps.SERVICING_REBOOT else: reboot_field = None # Default to reboot=True for backwards compatibility. should_reboot = (node.driver_internal_info.get(reboot_field, True) if reboot_field else True) if should_reboot: power_timeout = settings[0].get('power_timeout', 0) manager_utils.node_power_action(task, states.REBOOT, power_timeout) else: LOG.debug('Component requested no immediate reboot for node ' '%(node)s. Continuing with async polling.', {'node': node.uuid}) def _clear_updates(self, node): """Clears firmware updates artifacts Clears firmware updates from driver_internal_info and any files that were staged. Note that the caller must have an exclusive lock on the node. :param node: the node to clear the firmware updates from """ firmware_utils.cleanup(node) node.del_driver_internal_info('redfish_fw_updates') node.del_driver_internal_info('redfish_fw_update_start_time') node.del_driver_internal_info('firmware_cleanup') # Clean all temporary fields used during firmware update monitoring self._clean_temp_fields(node) node.save() @METRICS.timer('RedfishFirmware._query_update_failed') @periodics.node_periodic( purpose='checking if async update of firmware component failed', spacing=CONF.redfish.firmware_update_fail_interval, filters={'reserved': False, 'provision_state_in': [states.CLEANFAIL, states.DEPLOYFAIL, states.SERVICEFAIL], 'maintenance': True}, predicate_extra_fields=['driver_internal_info'], predicate=lambda n: n.driver_internal_info.get('redfish_fw_updates'), ) def _query_update_failed(self, task, manager, context): """Periodic job to check for failed firmware updates.""" # A firmware update failed. Discard any remaining firmware # updates so when the user takes the node out of # maintenance mode, pending firmware updates do not # automatically continue. 
LOG.error('Update firmware failed for node %(node)s. ' 'Discarding remaining firmware updates.', {'node': task.node.uuid}) task.upgrade_lock() self._clear_updates(task.node) @METRICS.timer('RedfishFirmware._query_update_status') @periodics.node_periodic( purpose='checking async update of firmware component', spacing=CONF.redfish.firmware_update_fail_interval, filters={'reserved': False, 'provision_state_in': [states.CLEANWAIT, states.DEPLOYWAIT, states.SERVICEWAIT]}, predicate_extra_fields=['driver_internal_info'], predicate=lambda n: n.driver_internal_info.get('redfish_fw_updates'), ) def _query_update_status(self, task, manager, context): """Periodic job to check firmware update tasks.""" self._check_node_redfish_firmware_update(task) def _handle_task_completion(self, task, sushy_task, messages, update_service, settings, current_update): """Handle firmware update task completion. :param task: a TaskManager instance :param sushy_task: the sushy task object :param messages: list of task messages :param update_service: the sushy firmware update service :param settings: firmware update settings :param current_update: the current firmware update being processed """ node = task.node if (sushy_task.task_state == sushy.TASK_STATE_COMPLETED and sushy_task.task_status in [sushy.HEALTH_OK, sushy.HEALTH_WARNING]): LOG.debug('Redfish task completed for node %(node)s, ' 'firmware %(firmware_image)s: %(messages)s.', {'node': node.uuid, 'firmware_image': current_update['url'], 'messages': ", ".join(messages)}) # Component-specific post-update handling component = current_update.get('component', '') component_type = redfish_utils.get_component_type(component) if component_type == redfish_utils.BMC: # BMC: Start version checking instead of immediate reboot self._handle_bmc_update_completion( task, update_service, settings, current_update) elif component_type == redfish_utils.NIC: # NIC: Handle completion with appropriate reboot behavior self._handle_nic_update_completion( task, 
update_service, settings, current_update) elif component_type == redfish_utils.BIOS: # BIOS: Check if reboot was actually triggered # Some BMCs (e.g., HPE iLO) complete the BIOS firmware task # very quickly (staging the firmware) before Ironic can poll # and trigger the reboot. In this case, we need to trigger # the reboot now to actually apply the firmware. if not current_update.get(BIOS_REBOOT_TRIGGERED): LOG.info('BIOS firmware update task completed for node ' '%(node)s but reboot was not triggered yet. ' 'Triggering reboot now to apply staged firmware.', {'node': node.uuid}) current_update[BIOS_REBOOT_TRIGGERED] = True node.set_driver_internal_info('redfish_fw_updates', settings) node.save() power_timeout = current_update.get('power_timeout', 0) manager_utils.node_power_action(task, states.REBOOT, power_timeout) return else: # Reboot was already triggered when task started, # just continue with next update LOG.info('BIOS firmware update task completed for node ' '%(node)s. System was already rebooted. ' 'Proceeding with continuation.', {'node': node.uuid}) # Clean up the reboot trigger flag current_update.pop(BIOS_REBOOT_TRIGGERED, None) self._continue_updates(task, update_service, settings) else: # Default: continue as before self._continue_updates(task, update_service, settings) else: error_msg = (_('Firmware update failed for node %(node)s, ' 'firmware %(firmware_image)s. ' 'Error: %(errors)s') % {'node': node.uuid, 'firmware_image': current_update['url'], 'errors': ", ".join(messages)}) self._clear_updates(node) if task.node.clean_step: manager_utils.cleaning_error_handler(task, error_msg) elif task.node.deploy_step: manager_utils.deploying_error_handler(task, error_msg) elif task.node.service_step: manager_utils.servicing_error_handler(task, error_msg) def _handle_nic_task_starting(self, task, task_monitor, settings, current_update): """Handle NIC firmware update task when it starts. 
NIC firmware behavior varies by hardware: - Some NICs need reboot to START applying (task stays at Starting) - Some NICs can start immediately but need reboot to APPLY (goes to Running, then needs reboot after completion) This method waits for the configured time (CONF.redfish.firmware_update_nic_starting_wait) to determine which type: - If still Starting after wait time → trigger reboot to start - If moves to Running → let it finish, reboot will happen after completion :param task: a TaskManager instance :param task_monitor: the sushy task monitor :param settings: firmware update settings :param current_update: the current firmware update being processed :returns: True if should stop polling, False to continue """ # Upgrade lock at the start since we may modify driver_internal_info task.upgrade_lock() node = task.node try: sushy_task = task_monitor.get_task() task_state = sushy_task.task_state LOG.debug('NIC update task state for node %(node)s: %(state)s', {'node': node.uuid, 'state': task_state}) # If task is Running, mark that reboot will be needed after # completion and let it continue if task_state == sushy.TASK_STATE_RUNNING: LOG.debug('NIC update task for node %(node)s is running. 
' 'Will wait for completion then reboot.', {'node': node.uuid}) # Clear flags since we're past the starting phase current_update.pop(NIC_STARTING_TIMESTAMP, None) current_update.pop(NIC_REBOOT_TRIGGERED, None) # Mark that reboot will be needed after completion current_update[NIC_NEEDS_POST_COMPLETION_REBOOT] = True node.set_driver_internal_info('redfish_fw_updates', settings) node.save() return False # Continue polling until completion # If task is in STARTING, check if we need to wait or reboot if task_state == sushy.TASK_STATE_STARTING: # Check if we already triggered a reboot if current_update.get(NIC_REBOOT_TRIGGERED): LOG.debug('NIC firmware update for node %(node)s: ' 'reboot already triggered, waiting for task ' 'to progress.', {'node': node.uuid}) return False # Continue polling starting_time = current_update.get(NIC_STARTING_TIMESTAMP) if not starting_time: # First time seeing STARTING - record timestamp current_update[NIC_STARTING_TIMESTAMP] = str( timeutils.utcnow().isoformat()) node.set_driver_internal_info( 'redfish_fw_updates', settings) node.save() LOG.debug('NIC firmware update task for node %(node)s ' 'is in STARTING state. Waiting to determine if ' 'reboot is needed to start update.', {'node': node.uuid}) return False # Keep polling # Check if configured wait time has elapsed start_time = timeutils.parse_isotime(starting_time) elapsed = timeutils.utcnow(True) - start_time nic_starting_wait = ( CONF.redfish.firmware_update_nic_starting_wait) if elapsed.seconds < nic_starting_wait: # Still within wait window, keep waiting LOG.debug('NIC update for node %(node)s still in ' 'STARTING after %(elapsed)s seconds. ' 'Waiting...', {'node': node.uuid, 'elapsed': elapsed.seconds}) return False # Keep polling # Wait time elapsed and still STARTING - need reboot to start LOG.info('NIC firmware update task for node %(node)s ' 'remained in STARTING state for %(wait)s+ seconds. ' 'Hardware requires reboot to start update. 
' 'Triggering reboot.', {'node': node.uuid, 'wait': nic_starting_wait}) # Mark that we triggered a reboot to prevent repeat reboots current_update[NIC_REBOOT_TRIGGERED] = True # Clean up timestamp current_update.pop(NIC_STARTING_TIMESTAMP, None) node.set_driver_internal_info('redfish_fw_updates', settings) node.save() # Trigger the reboot to start update power_timeout = current_update.get('power_timeout', 0) manager_utils.node_power_action(task, states.REBOOT, power_timeout) LOG.info('Reboot initiated for node %(node)s to start ' 'NIC firmware update', {'node': node.uuid}) return True # Stop polling, reboot triggered except Exception as e: LOG.warning('Unable to check NIC task state for node ' '%(node)s: %(error)s. Will retry.', {'node': node.uuid, 'error': e}) return False # Continue polling on error def _handle_bios_task_starting(self, task, task_monitor, settings, current_update): """Handle BIOS firmware update task when it starts. BIOS updates require a reboot to apply the firmware, so we trigger the reboot as soon as the update task reaches STARTING state rather than waiting for task completion. 
:param task: a TaskManager instance :param task_monitor: the sushy task monitor :param settings: firmware update settings :param current_update: the current firmware update being processed :returns: True if reboot was triggered, False otherwise """ if current_update.get(BIOS_REBOOT_TRIGGERED): # Already triggered, just keep polling return False # Upgrade lock at the start since we may modify driver_internal_info task.upgrade_lock() node = task.node try: sushy_task = task_monitor.get_task() LOG.debug('BIOS update task state for node %(node)s: ' '%(state)s', {'node': node.uuid, 'state': sushy_task.task_state}) # Check if task has started (STARTING state or beyond) # TaskState can be: New, Starting, Running, Suspended, # Interrupted, Pending, Stopping, Completed, Killed, # Exception, Service, Cancelling, Cancelled if sushy_task.task_state in [sushy.TASK_STATE_STARTING, sushy.TASK_STATE_RUNNING, sushy.TASK_STATE_PENDING]: LOG.info('BIOS firmware update task has started for ' 'node %(node)s (state: %(state)s). ' 'Triggering reboot to apply update.', {'node': node.uuid, 'state': sushy_task.task_state}) # Mark reboot as triggered to avoid repeated reboots current_update[BIOS_REBOOT_TRIGGERED] = True node.set_driver_internal_info( 'redfish_fw_updates', settings) node.save() # Trigger the reboot power_timeout = current_update.get('power_timeout', 0) manager_utils.node_power_action(task, states.REBOOT, power_timeout) LOG.info('Reboot initiated for node %(node)s to apply ' 'BIOS firmware update', {'node': node.uuid}) return True except Exception as e: LOG.warning('Unable to check BIOS task state for node ' '%(node)s: %(error)s. Will retry.', {'node': node.uuid, 'error': e}) return False def _handle_wait_completion(self, task, update_service, settings, current_update): """Handle firmware update wait completion. 
:param task: a TaskManager instance :param update_service: the sushy firmware update service :param settings: firmware update settings :param current_update: the current firmware update being processed """ # Upgrade lock at the start since we may modify driver_internal_info task.upgrade_lock() node = task.node # Check if this is BMC version checking if current_update.get('bmc_version_checking'): current_update.pop('bmc_version_checking', None) node.set_driver_internal_info( 'redfish_fw_updates', settings) node.save() # Continue BMC version checking self._handle_bmc_update_completion( task, update_service, settings, current_update) elif current_update.get('component_type') == redfish_utils.BMC: # BMC update wait expired - check if task is still running # before transitioning to version checking task_still_running = False try: task_monitor = redfish_utils.get_task_monitor( node, current_update['task_monitor']) if task_monitor.is_processing: task_still_running = True LOG.debug('BMC firmware update wait expired but task ' ' still processing for node %(node)s. ' 'Continuing to monitor task completion.', {'node': node.uuid}) except exception.RedfishConnectionError as e: LOG.debug('Unable to communicate with task monitor for node ' '%(node)s during wait completion: %(error)s. ' 'BMC may be resetting, will transition to version ' 'checking.', {'node': node.uuid, 'error': e}) except exception.RedfishError as e: LOG.debug('Task monitor unavailable for node %(node)s: ' '%(error)s. Task may have completed, transitioning ' 'to version checking.', {'node': node.uuid, 'error': e}) if task_still_running: # Task is still running, continue to monitor task completion # Don't transition to version checking yet. node.set_driver_internal_info('redfish_fw_updates', settings) node.save() return # Task completed, deleted or BMC unavailable # Transition to version checking LOG.info('BMC firmware update wait expired for node %(node)s. ' 'Task completed or unavailable. 
Transitioning to version ' 'checking mode.', {'node': node.uuid}) self._handle_bmc_update_completion( task, update_service, settings, current_update) else: # Regular wait completion - mark reboot needed if this is the # last update. Note: BIOS components reboot immediately when # task starts, so they won't use this path. if len(settings) == 1: component = current_update.get('component', '') component_type = redfish_utils.get_component_type(component) # For default/unknown components, reboot may be needed if component_type is None: node.set_driver_internal_info( FIRMWARE_REBOOT_REQUESTED, True) node.save() # Continue with updates self._continue_updates(task, update_service, settings) def _check_overall_timeout(self, task): """Check if firmware update has exceeded overall timeout. :param task: A TaskManager instance :returns: True if timeout exceeded and error was handled, False otherwise """ node = task.node overall_timeout = CONF.redfish.firmware_update_overall_timeout if overall_timeout <= 0: return False start_time_str = node.driver_internal_info.get( 'redfish_fw_update_start_time') if not start_time_str: return False start_time = timeutils.parse_isotime(start_time_str) elapsed = timeutils.utcnow(True) - start_time if elapsed.total_seconds() < overall_timeout: return False msg = (_('Firmware update on node %(node)s has exceeded ' 'the overall timeout of %(timeout)s seconds. ' 'Elapsed time: %(elapsed)s seconds.') % {'node': node.uuid, 'timeout': overall_timeout, 'elapsed': int(elapsed.total_seconds())}) LOG.error(msg) task.upgrade_lock() self._clear_updates(node) manager_utils.servicing_error_handler(task, msg, traceback=False) return True def _handle_firmware_update_task(self, task, node, current_update, update_service, settings): """Handle the firmware update task monitoring and completion. 
:param task: a TaskManager instance :param node: an Ironic node object :param current_update: the current firmware update being processed :param update_service: the sushy firmware update service :param settings: firmware update settings """ try: task_monitor = redfish_utils.get_task_monitor( node, current_update['task_monitor']) except exception.RedfishConnectionError as e: # If the BMC firmware is being updated, the BMC will be # unavailable for some amount of time. LOG.warning('Unable to communicate with task monitor service ' 'on node %(node)s. Will try again on the next poll. ' 'Error: %(error)s', {'node': node.uuid, 'error': e}) return except exception.RedfishError: # The BMC deleted the Task before we could query it LOG.warning('Firmware update completed for node %(node)s, ' 'firmware %(firmware_image)s, but success of the ' 'update is unknown. Assuming update was successful.', {'node': node.uuid, 'firmware_image': current_update['url']}) self._continue_updates(task, update_service, settings) return try: # The last response does not necessarily contain a Task, # so get it sushy_task = task_monitor.get_task() task_state = sushy_task.task_state except Exception as e: LOG.warning('Unable to get task for node %(node)s: %(error)s. ' 'Will retry on next poll.', {'node': node.uuid, 'error': e}) return # Check if task is in a terminal state (completed, failed, etc.) # If so, proceed directly to completion handling if task_state not in [sushy.TASK_STATE_RUNNING, sushy.TASK_STATE_STARTING, sushy.TASK_STATE_PENDING]: # Taks is done (COMPLETED, EXCEPTION, KILLED, CANCELLED, etc.) 
# Parse messages and handle completion LOG.debug('Firmware update task in terminal state %(state)s ' 'for node %(node)s', {'state': task_state, 'node': node.uuid}) # Only parse the messages if the BMC did not return parsed # messages messages = [] if sushy_task.messages and not sushy_task.messages[0].message: sushy_task.parse_messages() if sushy_task.messages is not None: for m in sushy_task.messages: msg = m.message if not msg or msg.lower() in ['unknown', 'unknown error']: msg = m.message_id if msg: messages.append(msg) task.upgrade_lock() self._handle_task_completion(task, sushy_task, messages, update_service, settings, current_update) return # Task is still in progress (RUNNING, STARTING, or PENDING) # Special handling for BIOS and NIC updates component = current_update.get('component', '') component_type = redfish_utils.get_component_type(component) if component_type == redfish_utils.BIOS: # For BIOS, check if task has reached STARTING state # and trigger reboot immediately if self._handle_bios_task_starting(task, task_monitor, settings, current_update): return # Reboot triggered, done # Task is still processing, keep polling return if component_type == redfish_utils.NIC: # For NIC, wait 30s to see if hardware needs reboot if self._handle_nic_task_starting(task, task_monitor, settings, current_update): return # Reboot triggered, done # Task is still processing (or waiting), keep polling return # For other component types, just log and keep polling LOG.debug('Firmware update in progress for node %(node)s, ' 'firmware %(firmware_image)s.', {'node': node.uuid, 'firmware_image': current_update['url']}) @METRICS.timer('RedfishFirmware._check_node_redfish_firmware_update') def _check_node_redfish_firmware_update(self, task): """Check the progress of running firmware update on a node.""" node = task.node # Check overall timeout for firmware update operation if self._check_overall_timeout(task): return settings = node.driver_internal_info['redfish_fw_updates'] 
current_update = settings[0] try: update_service = redfish_utils.get_update_service(node) except exception.RedfishConnectionError as e: # If the BMC firmware is being updated, the BMC will be # unavailable for some amount of time. LOG.warning('Unable to communicate with firmware update service ' 'on node %(node)s. Will try again on the next poll. ' 'Error: %(error)s', {'node': node.uuid, 'error': e}) return # Touch provisioning to indicate progress is being monitored. # This prevents heartbeat timeout from triggering for steps that # don't require the ramdisk agent (requires_ramdisk=False). # Note: Only touch after successful BMC communication to ensure # the process eventually times out if the BMC is unresponsive. node.touch_provisioning() wait_start_time = current_update.get('wait_start_time') if wait_start_time: wait_start = timeutils.parse_isotime(wait_start_time) elapsed_time = timeutils.utcnow(True) - wait_start if elapsed_time.seconds >= current_update['wait']: LOG.debug('Finished waiting after firmware update ' '%(firmware_image)s on node %(node)s. ' 'Elapsed time: %(seconds)s seconds', {'firmware_image': current_update['url'], 'node': node.uuid, 'seconds': elapsed_time.seconds}) current_update.pop('wait', None) current_update.pop('wait_start_time', None) # Handle wait completion self._handle_wait_completion( task, update_service, settings, current_update) else: LOG.debug('Continuing to wait after firmware update ' '%(firmware_image)s on node %(node)s. 
' 'Elapsed time: %(seconds)s seconds', {'firmware_image': current_update['url'], 'node': node.uuid, 'seconds': elapsed_time.seconds}) return # Handle firmware update task monitoring self._handle_firmware_update_task( task, node, current_update, update_service, settings) def _stage_firmware_file(self, node, component_update): try: url = component_update['url'] name = component_update['component'] parsed_url = urlparse(url) scheme = parsed_url.scheme.lower() source = (CONF.redfish.firmware_source).lower() # Keep it simple, in further processing TLS does not matter if scheme == 'https': scheme = 'http' # If source and scheme is HTTP, then no staging, # returning original location if scheme == 'http' and source == scheme: LOG.debug('For node %(node)s serving firmware for ' '%(component)s from original location %(url)s', {'node': node.uuid, 'component': name, 'url': url}) return url, None # If source and scheme is Swift, then not moving, but # returning Swift temp URL if scheme == 'swift' and source == scheme: temp_url = firmware_utils.get_swift_temp_url(parsed_url) LOG.debug('For node %(node)s serving original firmware at ' 'for %(component)s at %(url)s via Swift temporary ' 'url %(temp_url)s', {'node': node.uuid, 'component': name, 'url': url, 'temp_url': temp_url}) return temp_url, None # For remaining, download the image to temporary location temp_file = firmware_utils.download_to_temp(node, url) return firmware_utils.stage(node, source, temp_file) except exception.IronicException: firmware_utils.cleanup(node) raise