From 57bdad10fbafaaeee798dd4e56a2505b949f8a63 Mon Sep 17 00:00:00 2001 From: James Antill Date: Tue, 19 Aug 2025 19:10:58 -0400 Subject: [PATCH] updates-uptimes: Use machine data to find reboot/etc. utf8 diff. uptime-max. Signed-off-by: James Antill --- files/scripts/updates-uptime-cmd.py | 205 ++++++++++++++---- ...generate-updates-uptimes-per-host-file.yml | 35 ++- 2 files changed, 188 insertions(+), 52 deletions(-) diff --git a/files/scripts/updates-uptime-cmd.py b/files/scripts/updates-uptime-cmd.py index fcedca3a35..840d8b7844 100755 --- a/files/scripts/updates-uptime-cmd.py +++ b/files/scripts/updates-uptime-cmd.py @@ -20,8 +20,9 @@ # $0 list '*.stg.*' ... see what staging looks like. # $0 list '*copr*' ... see what copr looks like. # $0 history-keep 4 ... keep four days of history (including today) -# $0 uptime 1d ... see what hasn't been rebooted in the last 24 hours. -# $0 uptime 25w ... see what hasn't been rebooted in too damn long. +# $0 uptime-min 1d ... see what hasn't been rebooted in the last 24 hours. +# $0 uptime-max 1d ... see what has been rebooted in the last 24 hours. +# $0 uptime-min 25w ... see what hasn't been rebooted in too damn long. # $0 update-daily-refresh ... daily update, including a new history, and # refresh the main file (so any old hosts aren't there anymore). @@ -29,15 +30,31 @@ import os import sys import fnmatch -import glob import locale import shutil import time +# Use utf8 prefixes in diff +conf_utf8_diff = True +_conf_utf8_boot_ed = '⚠' # Rebooted +_conf_utf8_boot_up = '⚐' # Rebooted and updated +_conf_utf8_more_up = '➚' +_conf_utf8_less_up = '➘' +_conf_utf8_diff_os = '➜' # '♺' OSinfo is different, but the machine is the same +_conf_utf8_diff_hw = '⇉' # '모' machine_id is different + # If we try to update this seconds since the file changed, flush the # ansible FACT cache. 
conf_dur_flush_cache = (60*60*8) +# This is kind of a hack, if you run from a cron job then it should run at +# the same time each day, and this should be 1 hour or less. But life isn't +# perfect, so we give it some more time. +# The two competing problems are 1) reboot machine with low uptime. +# 2) get data at 23:59 yesterday and 0:01 today. +# ...and we can't fix both. +conf_tmdiff_fudge = (60*60*8) + # How many hosts to show in tier 4 updates/uptimes... conf_stat_4_hosts = 4 @@ -81,11 +98,31 @@ except locale.Error: locale.setlocale(locale.LC_ALL, 'C') -fname = conf_path + "ansible-list-updates-uptime.txt" +_fname = "ansible-list-updates-uptime.txt" +fname = conf_path + _fname backup_today = time.strftime("%Y-%m-%d", time.gmtime()) fname_today = fname + '.' + backup_today -backups = sorted(x.removeprefix(fname + '.') for x in glob.glob(fname + '.*')) +# History files are named .YYYY-MM-DD +def _glob_hist_suffix(): + for fn in os.listdir(os.path.dirname(fname)): + if not fn.startswith(_fname + '.'): + continue + fn = fn.removeprefix(_fname + '.') + if len(fn) != len("YYYY-MM-DD"): + continue + if fn[0] != '2': continue # Year + if fn[1] not in "0123456789": continue + if fn[2] not in "0123456789": continue + if fn[3] not in "0123456789": continue + if fn[4] != '-': continue + if fn[5] not in "01": continue # Month + if fn[6] not in "0123456789": continue + if fn[7] != '-': continue + if fn[8] not in "0123": continue # Day + if fn[9] not in "0123456789": continue + yield fn +backups = sorted(_glob_hist_suffix()) tm_yesterday = int(time.time()) - (60*60*24) backup_yesterday = time.strftime("%Y-%m-%d", time.gmtime(tm_yesterday)) @@ -113,7 +150,7 @@ class Host(): """ Class for holding the Host data from a line in the files. 
""" __slots__ = ['name', 'rpms', 'uptime', 'date', 'osname', 'osvers', - 'osname_small'] + 'osname_small', 'machine_id', 'boot_id'] def __init__ (self, data): global _max_len_osnm @@ -123,9 +160,13 @@ class Host(): self.rpms = data['rpms'] self.uptime = data['uptime'] self.date = data['date'] + self.osname = data['osname'] self.osvers = data['osvers'] + self.machine_id = data['machine_id'] + self.boot_id = data['boot_id'] + if False: pass elif self.osname == 'CentOS': osname_small = 'EL' @@ -156,6 +197,8 @@ class Host(): return False if self.osvers != other.osvers: return False + if self.machine_id != other.machine_id: + return False return True def __gt__(self, other): @@ -213,7 +256,7 @@ if len(sys.argv) >= 2: "stats", "update", "update-fast", "update-flush", "update-daily", "update-daily-refresh", - "uptime",): + "uptime", "uptime-min", "uptime-max",): cmd = sys.argv.pop(1) _tm_d = {'d' : 60*60*24, 'h' : 60*60, 'm' : 60, 's' : 1, @@ -267,6 +310,8 @@ def format_duration(seconds, short=False, static=False): if short: if dur == 0 and not static: return '<1h' + if dur == 0: + return '<01h' ret = [] dur = _add_dur(dur, ret, 24, "h", static=static) dur = _add_dur(dur, ret, 7, "d", static=static) @@ -304,11 +349,12 @@ cmp = None # This does arguments for a bunch of commands, like stats/list/etc. # by using fname1() after, which looks at cmp_arg. # But also does diff arguments. -def _cmp_arg(): +def _cmp_arg(usage=True): global cmp global cmp_arg if len(sys.argv) < 2 or sys.argv[1] == "main": + cmp_arg = False if len(sys.argv) >= 2: sys.argv.pop(1) cmp = backups[-1] # Most recent @@ -323,8 +369,9 @@ def _cmp_arg(): cmp = backup_yesterday cmp_arg = True elif sys.argv[1] not in backups: - _usage() - print("Backups:", ", ".join(backups)) + if usage: + _usage() + print("History:", ", ".join(backups)) sys.exit(1) else: cmp = sys.argv[1] @@ -339,8 +386,12 @@ def line2data(line): name, rpms, uptime, date = line.split(' ', 3) osname = "Unknown" osvers = "?" + machine_id = "?" 
+ boot_id = "?" if ' ' in date: date, osname, osvers = date.split(' ', 2) + if ' ' in osvers: + osvers, machine_id, boot_id = osvers.split(' ', 2) rpms = int(rpms) uptime = int(uptime) @@ -358,12 +409,19 @@ def filter_name_datas(datas, name): yield data # Filter datas using uptime as a minium. -def filter_uptime_datas(datas, uptime): +def filter_uptime_min_datas(datas, uptime): for data in datas: if data.uptime < uptime: continue yield data +# Filter datas using uptime as a maximum. +def filter_uptime_max_datas(datas, uptime): + for data in datas: + if data.uptime > uptime: + continue + yield data + # Sub. suffix of DNS names for UI def _ui_name(name): for suffix in conf_suffix_dns_replace: @@ -420,6 +478,27 @@ def fname1(): return lines2datas(bfname2lines(cmp)) return fname_datas() +# Has the host been rebooted between these two points. +def host_rebooted(d1, d2): + if d1.boot_id != d2.boot_id: + return True + if d2.boot_id != '?': + return False + + # Now we try to work it out from uptime... + if d1.date == d2.date and d1.uptime > d2.uptime: + return True + # However, we can be looking at old history + tm1 = time.mktime(time.strptime(d1.date, "%Y-%m-%d")) + tm2 = time.mktime(time.strptime(d2.date, "%Y-%m-%d")) + if tm1 > tm2: # Looking backwards in time... + return False + d1up = d1.uptime + tmdiff = tm2 - tm1 + if tmdiff > conf_tmdiff_fudge: + d1up += tmdiff - conf_tmdiff_fudge + return d1up > d2.uptime + _max_len_name = 0 _max_len_rpms = 0 # Number of rpm updates via. _ui_int(). _max_len_upts = 0 # Uptime duration with short=True @@ -461,7 +540,7 @@ def _max_update_correct(prefix): # Return stats for updates added/deleted between two data sets. 
def _diffstats(data1, data2): - uadd, udel = 0, 0 + uadd, udel, boot = 0, 0, 0 data1 = list(sorted(data1)) data2 = list(sorted(data2)) @@ -469,6 +548,7 @@ def _diffstats(data1, data2): if len(data1) <= 0: d2 = data2.pop(0) uadd += d2.rpms + boot += 1 continue if len(data2) <= 0: d1 = data1.pop(0) @@ -486,14 +566,18 @@ def _diffstats(data1, data2): if d1.name > d2.name: uadd += d2.rpms data2.pop(0) + boot += 1 continue if d1 == d2: + if host_rebooted(d1, d2): + boot += 1 data1.pop(0) data2.pop(0) continue - if d1.osinfo != d2.osinfo: + if d1.machine_id != d2.machine_id or d1.osinfo != d2.osinfo: + boot += 1 udel -= d1.rpms uadd += d2.rpms data1.pop(0) @@ -501,6 +585,8 @@ def _diffstats(data1, data2): continue # Now name is eq and osinfo is eq + if host_rebooted(d1, d2): + boot += 1 # So either new updates arrived, or we installed some and they went # down ... alas. we can't tell if both happened. if d1.rpms > d2.rpms: @@ -511,11 +597,11 @@ def _diffstats(data1, data2): data2.pop(0) # diffstat returns... - return uadd, udel + return uadd, udel, boot def _ui_diffstats(data1, data2): cmpds = _diffstats(data1, data2) - return _ui_int(cmpds[0]), _ui_int(cmpds[1]) + return _ui_int(cmpds[0]), _ui_int(cmpds[1]), _ui_int(cmpds[2]) @@ -573,7 +659,7 @@ def _backup_suffix(backup): suffix = '' if backup == backup_today: if ident: - suffix = ' (today, is identical)' + suffix = ' (today, is eq)' else: suffix = ' (today)' if backup == backup_yesterday: @@ -590,7 +676,8 @@ if cmd in ("backups", "hist", "history"): # We _could_ open+read+etc each file, just to find out the max updates for # all hist ... 
but len("Updates")+2=9 which means 9,999,999 updates) hl = len("Hosts") - ul = len("Updates") + 2 + ul = len("Updates") + rl = len("Boots") if conf_fast_width_history: ul += 2 else: @@ -602,17 +689,17 @@ if cmd in ("backups", "hist", "history"): hl = max(hl, len(_ui_int(len(data)))) ul = max(ul, len(updates)) - print(" %10s %*s %*s %*s %*s" % ("Day", hl, "Hosts", - ul, "Updates", ul, "Avail", ul, "Inst.")) + print(" %10s %*s %*s %*s %*s %*s" % ("Day", hl, "Hosts", + ul, "Updates", ul, "Avail", ul+1, "Inst.", rl, "Boots")) for backup in reversed(backups): data = list(sorted(lines2datas(bfname2lines(backup)))) updates = _ui_int(sum(d.rpms for d in last_data)) ul = max(ul, len(updates)) cmpds = _ui_diffstats(data.copy(), last_data.copy()) - print(' %10s %*s %*s, %*s %*s, %s' % (last_name, + print(' %10s %*s %*s, %*s %*s, %*s %s' % (last_name, hl, _ui_int(len(last_data)), - ul, updates, ul, cmpds[0], ul+1, cmpds[1], last_suff)) + ul, updates, ul, cmpds[0], ul+1, cmpds[1], rl, cmpds[2], last_suff)) last_name = backup last_data = data last_suff = _backup_suffix(backup) @@ -666,9 +753,9 @@ if not os.path.exists(fname): if not _main_file_recent(): print(" Warning: Main file is old. 
Run update sub-command", file=sys.stderr) if fname_today is None: - print(" Warning: Backup for today does not exist!", file=sys.stderr) + print(" Warning: History for today does not exist!", file=sys.stderr) if fname_yesterday is None: - print(" Warning: Backup for yesterday does not exist!", file=sys.stderr) + print(" Warning: History for yesterday does not exist!", file=sys.stderr) def _cli_match_host(data): if len(sys.argv) >= 2: @@ -808,6 +895,8 @@ def _print_info(host, data): print(" Updates:", _ui_int(host.rpms)) print(" Uptime:", format_duration(host.uptime)) # !ui_dur print(" Checked:", host.date) + print(" Machine:", host.machine_id) + print(" Boot:", host.boot_id) if cmd in ("host", "info"): if cmd == "host": @@ -818,12 +907,18 @@ if cmd in ("host", "info"): host = sys.argv.pop(1) if len(sys.argv) >= 2 and sys.argv[1] == "all": for b in backups: - print("Backup:", b) + print("History:", b) _print_info(host, lines2datas(bfname2lines(b))) sys.argv = [sys.argv[0]] + while True: + _cmp_arg() + if cmp_arg: # One or more historical files... 
+ print("History:", sys.argv.pop(1)) + else: print("Main:") - _cmp_arg() - _print_info(host, fname1()) + _print_info(host, fname1()) + if len(sys.argv) < 2: + break def _print_line(prefix, data): print("%s%-*s %*s %*s %*s %s" % (prefix, @@ -847,14 +942,17 @@ if cmd == "list": _print_line('', d1) _explain_ui_name() -if cmd == "uptime": +if cmd in ("uptime", "uptime-max", "uptime-min"): age = 0 if len(sys.argv) >= 2: age = parse_duration(sys.argv.pop(1)) _cmp_arg() data = fname1() - data = list(filter_uptime_datas(data, age)) + if cmd == "uptime-max": + data = list(filter_uptime_max_datas(data, age)) + else: + data = list(filter_uptime_min_datas(data, age)) _max_update(data) _max_update_correct('') for d1 in data: @@ -910,25 +1008,44 @@ if cmd in ("diff", "diff-u"): data2.pop(0) continue - if d1 == d2: - _print_line(' ', d2) - data1.pop(0) - data2.pop(0) - continue - - if cmd == "diff-u": - _print_line('-', d1) - data1.pop(0) - _print_line('+', d2) - data2.pop(0) - continue - - # diff + # d1.name == d2.name; so both are going now data1.pop(0) - _print_line('!', d2) data2.pop(0) + # Name, rpms, and OSname/OSvers are the same + if d1 == d2: + if cmd == "diff" and conf_utf8_diff and host_rebooted(d1, d2): + _print_line(_conf_utf8_boot_ed, d2) + continue + _print_line(' ', d2) + continue + + # Something about host changed, show old/new... + if cmd == "diff-u": + _print_line('-', d1) + _print_line('+', d2) + continue + + # Something changed, but we only show the new data... 
+ if conf_utf8_diff: + if False: pass + elif d1.machine_id != d2.machine_id: + _print_line(_conf_utf8_diff_hw, d2) + elif d1.osinfo != d2.osinfo: + _print_line(_conf_utf8_diff_os, d2) + elif host_rebooted(d1, d2) and d1.rpms > d2.rpms: + _print_line(_conf_utf8_boot_up, d2) + elif host_rebooted(d1, d2): + _print_line(_conf_utf8_boot_ed, d2) + elif d1.rpms > d2.rpms: + _print_line(_conf_utf8_less_up, d2) + else: # d1.rpms < d2.rpms: + _print_line(_conf_utf8_more_up, d2) + continue + _print_line('!', d2) + continue - print('hosts=%s updates=%s (a=%s i=%s)' % (hosts, updates, cmpds[0],cmpds[1])) + print('hosts=%s updates=%s (a=%s i=%s) boots=%s' % (hosts, updates, + cmpds[0], cmpds[1], cmpds[2])) _explain_ui_name() diff --git a/playbooks/generate-updates-uptimes-per-host-file.yml b/playbooks/generate-updates-uptimes-per-host-file.yml index 5ecc432247..3116b78b9b 100644 --- a/playbooks/generate-updates-uptimes-per-host-file.yml +++ b/playbooks/generate-updates-uptimes-per-host-file.yml @@ -1,7 +1,5 @@ # -# simple playbook to list number of updates per. hosts and when they were last -# rebooted. -# It could be a lot faster if we didn't gather facts, but we need that uptime. +# simple playbook to see updates and uptimes per. host and OS info # --- @@ -10,28 +8,50 @@ - name: Check for updates hosts: distro_RedHat:distro_CentOS:distro_Fedora:!ocp*:!worker* - gather_facts: true tasks: +## We do this explicitly because ansible will cache facts, but we don't +## want to uncache all facts just make sure we have the latest uptime +## ** Doesn't seem to make any difference... +# - name: Gather the latest uptime and OS +# ansible.builtin.setup: +# - gather_subset: ["!all", "!min", "hardware"] +# - filter: ["uptime_seconds", "distribution", "distribution_version", "machine_id"] +# tags: updates + +# This should be in our facts, but I don't see it. Newer ansible? 
+ - name: Gather boot-id, if we can + ansible.builtin.slurp: src=/proc/sys/kernel/random/boot_id + register: boot_id_data + ignore_errors: yes + tags: updates + # The slurp above may fail (ignore_errors), leaving .content undefined: default it before decoding. + - name: Decode the real boot-id + ansible.builtin.set_fact: + boot_id: "{{ (boot_id_data.content | default('') | b64decode).strip() | default('?', true) }}" + tags: updates + # # We use the command module here because the real module can't expire # - name: Make dnf recheck for new metadata from repos ansible.builtin.command: dnf clean expire-cache - tags: expire + tags: + - expire + - updates - name: Check for updates (dnf) dnf: list=updates register: pkgoutput tags: updates - # This is identical to the Fedora one above... + # Dump all our information into a file - name: Generate the Upgrade+uptime report ansible.builtin.lineinfile: regexp: '^{{inventory_hostname}} ' - line: "{{inventory_hostname}} {{pkgoutput.results|length}} {{ansible_uptime_seconds}} {{ansible_date_time['date']}} {{ansible_distribution}} {{ansible_distribution_version}}" + line: "{{inventory_hostname}} {{pkgoutput.results|length}} {{ansible_uptime_seconds}} {{ansible_date_time['date']}} {{ansible_distribution}} {{ansible_distribution_version}} {{ansible_machine_id}} {{boot_id}}" path: /var/log/ansible-list-updates-uptime.txt create: yes delegate_to: localhost @@ -39,7 +59,6 @@ - name: Create a daily backup of the updates+uptime file. hosts: localhost - gather_facts: true tasks: - name: Sort and copy the file