Path: blob/master/tools/dev/update_gitlab_versions.rb
74483 views
#!/usr/bin/env ruby
# -*- coding: binary -*-

#
# by h00die
#
# Fetches all stable GitLab EE and CE Docker tags newer than the highest version
# already present in data/gitlab_versions.json, then pulls only the application
# layer blob directly from the Docker Registry API (no Docker daemon required)
# and streams it through gzip+tar to extract the application-HASH.css filename.
#
# Requirements: none beyond Ruby stdlib
#
# Usage:
# ruby tools/dev/update_gitlab_versions.rb [options]
#

require 'optparse'
require 'net/http'
require 'uri'
require 'json'
require 'set'
require 'zlib'

# -- paths / constants ---------------------------------------------------------

# JSON map that this script reads and rewrites (path is relative to this file).
JSON_FILE = File.expand_path('../../data/gitlab_versions.json', __dir__)
# Base URL of the Docker Registry v2 API (manifests and blobs).
REGISTRY = 'https://registry-1.docker.io/v2'.freeze
# Token endpoint; callers append "&scope=repository:<repo>:pull".
AUTH_URL = 'https://auth.docker.io/token?service=registry.docker.io'.freeze
# Upper bound on the number of worker threads fetching CSS hashes.
MAX_CONCURRENT = 4

# Tag formats accepted for each edition: MAJOR.MINOR.PATCH-ee.N / -ce.N.
# The first three captures are the semver components.
EE_TAG_RE = /\A(\d+)\.(\d+)\.(\d+)-ee\.(\d+)\z/
CE_TAG_RE = /\A(\d+)\.(\d+)\.(\d+)-ce\.(\d+)\z/

# One entry per GitLab edition:
#   repo       - Docker Hub repository name
#   tag_re     - regex a tag must match to be considered
#   label      - short name used in console output
#   version_fn - maps a Docker tag to the version string stored in the JSON
#                map (drops the trailing ".N" build counter)
EDITIONS = [
  {
    repo: 'gitlab/gitlab-ee', tag_re: EE_TAG_RE, label: 'EE',
    version_fn: ->(tag) { tag.sub(/-ee\.\d+\z/, '-ee') }
  },
  {
    repo: 'gitlab/gitlab-ce', tag_re: CE_TAG_RE, label: 'CE',
    version_fn: ->(tag) { tag.sub(/-ce\.\d+\z/, '-ce') }
  }
].freeze

# Prefer Docker v2 manifest types - OCI manifests may use zstd-compressed layers
# which we cannot decompress.
# Docker v2 layers are always gzip.
# Value of the Accept header sent with every registry request, in the order
# the media types are listed here.
MANIFEST_ACCEPT = [
  'application/vnd.docker.distribution.manifest.v2+json',
  'application/vnd.docker.distribution.manifest.list.v2+json',
  'application/vnd.oci.image.index.v1+json',
  'application/vnd.oci.image.manifest.v1+json'
].join(', ').freeze

# -- colours -------------------------------------------------------------------

# ANSI colour helpers used for console output throughout this script.
# Each returns a new string wrapped in a bold-colour escape and a reset.
class String
  def red
    "\e[1;31;40m#{self}\e[0m"
  end

  def yellow
    "\e[1;33;40m#{self}\e[0m"
  end

  def green
    "\e[1;32;40m#{self}\e[0m"
  end

  def cyan
    "\e[1;36;40m#{self}\e[0m"
  end
end

# -- helpers -------------------------------------------------------------------

# Extracts [major, minor, patch] from a Docker tag using +re+, whose first
# three capture groups must be the numeric components.
# Returns nil when the tag does not match.
def tag_semver(tag, re)
  m = re.match(tag)
  return nil unless m

  [m[1].to_i, m[2].to_i, m[3].to_i]
end

# Parses a leading "MAJOR.MINOR.PATCH" from +str+ into an integer triple.
# Anything after the patch number (e.g. "-ee") is ignored.
# Returns nil when +str+ does not start with a three-part version.
def parse_semver(str)
  m = str.match(/\A(\d+)\.(\d+)\.(\d+)/)
  m ? [m[1].to_i, m[2].to_i, m[3].to_i] : nil
end

# Performs an unauthenticated GET and returns the Net::HTTPResponse.
# TLS is enabled when the URL scheme is https.
def http_get(url)
  uri = URI(url)
  Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == 'https',
                                      open_timeout: 15, read_timeout: 30) do |http|
    http.request(Net::HTTP::Get.new(uri))
  end
end

# -- Docker Hub tag enumeration -------------------------------------------------

# Returns every tag name of +repo+ on Docker Hub, following the "next" links
# of the paginated API until they run out. On a non-2xx response a warning is
# printed and the tags collected so far are returned.
def fetch_all_tags(repo)
  tags = []
  url = "https://hub.docker.com/v2/repositories/#{repo}/tags?page_size=100&ordering=last_updated"

  loop do
    resp = http_get(url)
    unless resp.is_a?(Net::HTTPSuccess)
      warn " Docker Hub request failed (#{resp.code}): #{url}".red
      break
    end

    data = JSON.parse(resp.body)
    # Array() tolerates a missing or null "results" key; entries without a
    # name are dropped rather than carried along as nil.
    tags.concat(Array(data['results']).filter_map { |t| t['name'] })

    url = data['next']
    break unless url
  end

  tags
end

# -- Streaming tar scanner ------------------------------------------------------
#
# Feeds decompressed gzip data into this object chunk by chunk. It walks tar
# headers (512 bytes each), skips file data blocks, and stops as soon as a
# path matching /assets/application-HASH.css is found.
# The caller can abort the HTTP download immediately after `found` is set,
# avoiding downloading the rest of the layer.
#
# Handles three tar long-name extensions:
#   PAX ('x'/'X' typeflag) - extended header with `path=<full>` key
#   GNU ('L' typeflag)     - next data block is the full filename
#   USTAR prefix field     - 155-byte prefix at offset 345 prepended to name
#
# Usage: append decompressed tar bytes with #<< (any chunk size) and poll
# #found, which is nil until a matching path has been seen and the hex HASH
# portion of the filename afterwards.
#
# NOTE: #samples is kept only so existing callers of the reader keep working.
# Nothing in this class adds to it; TarFilenameCollector is the class that
# gathers filenames for --sample diagnostics.
class TarCssScanner
  HEADER_SIZE = 512
  CSS_HASH_RE = %r{/assets/application-([a-f0-9]+)\.css\z}

  attr_reader :found, :samples

  def initialize
    @buf = ''.b
    @skip = 0           # bytes of file data / padding still to discard
    @found = nil
    @samples = []       # never populated, see class comment
    @pax_path = nil     # path extracted from PAX extended header
    @gnu_name = nil     # name extracted from GNU ././@LongLink block
    @collect_type = nil # :pax or :gnu - currently collecting payload
    @collect_need = 0   # bytes still needed
    @collect_buf = ''.b
  end

  # Appends +data+ to the internal buffer and processes as much of it as
  # possible. Processing stops permanently once #found is set.
  # Returns self so calls can be chained.
  def <<(data)
    @buf << data.b
    step while @found.nil? && enough?
    self
  end

  private

  # True when the buffer holds enough bytes for the current mode to make
  # progress: any byte while collecting or skipping, a full header otherwise.
  def enough?
    return !@buf.empty? if @collect_type || @skip > 0

    @buf.bytesize >= HEADER_SIZE
  end

  # Runs one unit of work for whichever mode the scanner is in.
  def step
    if @collect_type
      collect_payload
    elsif @skip > 0
      skip_data
    else
      read_header
    end
  end

  # Removes and returns the first +count+ bytes of the buffer.
  def consume(count)
    chunk = @buf.byteslice(0, count)
    @buf = @buf.byteslice(count..) || ''.b
    chunk
  end

  # Collect mode: gather the PAX / GNU payload, then extract the long name
  # once the whole (512-byte padded) payload has arrived.
  def collect_payload
    take = [@collect_need, @buf.bytesize].min
    @collect_buf << consume(take)
    @collect_need -= take
    return if @collect_need > 0

    case @collect_type
    when :pax
      # PAX lines: "<decimal-len> <key>=<value>\n" - keep binary to match binary literals
      @collect_buf.scan(/\d+ path=([^\n]+)/) { |m| @pax_path = m[0] }
    when :gnu
      @gnu_name = @collect_buf.delete("\x00")
    end

    @collect_type = nil
    @collect_buf = ''.b
  end

  # Skip mode: discard data / padding blocks of the entry just inspected.
  def skip_data
    take = [@skip, @buf.bytesize].min
    consume(take)
    @skip -= take
  end

  # Switches to collect mode for a long-name payload of +padded+ bytes.
  def begin_collect(type, padded)
    @collect_type = type
    @collect_need = padded
    @collect_buf = ''.b
  end

  # Header mode: decode one 512-byte tar header and decide what to do next.
  def read_header
    header = consume(HEADER_SIZE)
    typeflag = header.byteslice(156, 1) || "\x00"
    name = header.byteslice(0, 100).delete("\x00")
    size = header.byteslice(124, 12).strip.to_i(8) # size field is octal text
    padded = ((size + 511) / 512) * 512            # data is stored in whole 512-byte blocks

    case typeflag
    when 'x', 'X' # PAX extended header
      begin_collect(:pax, padded)
    when 'L' # GNU long filename follows in data block
      begin_collect(:gnu, padded)
    else
      return if name.empty? # end-of-archive zero block

      effective = effective_path(header, name)
      # A long name applies only to the entry that directly follows it.
      @pax_path = nil
      @gnu_name = nil

      if (m = effective.match(CSS_HASH_RE))
        @found = m[1]
        return
      end

      @skip = padded
    end
  end

  # Reconstruct full path: PAX > GNU > USTAR-prefix+name
  def effective_path(header, name)
    return @pax_path if @pax_path
    return @gnu_name if @gnu_name

    prefix = header.byteslice(345, 155).delete("\x00")
    prefix.empty? ? name : "#{prefix}/#{name}"
  end
end

# Collects every filename in a gzip-compressed tar layer that matches a regex.
# Never stops early - reads the whole stream.
# Used for --sample diagnostics.
#
# Streaming tar scanner that records the path of every entry matching
# +pattern+. Append decompressed tar bytes with #<< (any chunk size); the
# matching paths accumulate in #filenames in archive order. Long names are
# resolved from PAX ('x'/'X') headers, GNU ('L') headers or the USTAR prefix
# field, in that order of preference.
class TarFilenameCollector
  HEADER_SIZE = 512

  attr_reader :filenames

  def initialize(pattern = /\.css\z/)
    @pattern = pattern
    @filenames = []
    @buf = ''.b
    @skip = 0           # bytes of file data / padding still to discard
    @collect_type = nil # :pax or :gnu while a long-name payload is being read
    @collect_need = 0   # payload bytes still missing
    @collect_buf = ''.b
    @pax_path = nil
    @gnu_name = nil
  end

  # Appends +data+ and processes everything that is complete. Returns self.
  def <<(data)
    @buf << data.b
    step while enough?
    self
  end

  private

  # True when the current mode can make progress with the buffered bytes.
  def enough?
    return !@buf.empty? if @collect_type || @skip > 0

    @buf.bytesize >= HEADER_SIZE
  end

  # Runs one unit of work for whichever mode the collector is in.
  def step
    if @collect_type
      collect_payload
    elsif @skip > 0
      skip_data
    else
      read_header
    end
  end

  # Removes and returns the first +count+ bytes of the buffer.
  def consume(count)
    chunk = @buf.byteslice(0, count)
    @buf = @buf.byteslice(count..) || ''.b
    chunk
  end

  # Gathers a PAX / GNU long-name payload and extracts the name when complete.
  def collect_payload
    take = [@collect_need, @buf.bytesize].min
    @collect_buf << consume(take)
    @collect_need -= take
    return if @collect_need > 0

    case @collect_type
    when :pax
      @collect_buf.scan(/\d+ path=([^\n]+)/) { |m| @pax_path = m[0] }
    when :gnu
      @gnu_name = @collect_buf.delete("\x00")
    end
    @collect_type = nil
    @collect_buf = ''.b
  end

  # Discards file data / padding belonging to the entry just recorded.
  def skip_data
    take = [@skip, @buf.bytesize].min
    consume(take)
    @skip -= take
  end

  # Decodes one 512-byte tar header.
  def read_header
    header = consume(HEADER_SIZE)
    typeflag = header.byteslice(156, 1) || "\x00"
    name = header.byteslice(0, 100).delete("\x00")
    size = header.byteslice(124, 12).strip.to_i(8) # size field is octal text
    padded = ((size + 511) / 512) * 512            # data occupies whole 512-byte blocks

    case typeflag
    when 'x', 'X', 'L'
      @collect_type = typeflag == 'L' ? :gnu : :pax
      @collect_need = padded
      @collect_buf = ''.b
    else
      return if name.empty? # end-of-archive zero block

      effective = @pax_path || @gnu_name
      unless effective
        prefix = header.byteslice(345, 155).delete("\x00")
        effective = prefix.empty? ? name : "#{prefix}/#{name}"
      end
      # A long name applies only to the entry that directly follows it.
      @pax_path = nil
      @gnu_name = nil

      @filenames << effective if effective.match?(@pattern)
      @skip = padded
    end
  end
end

# -- Docker Registry API --------------------------------------------------------

# Requests a pull-scoped bearer token for +repo+ from the auth endpoint.
# Raises RuntimeError when the auth service does not answer with a 2xx status,
# so a failed login is reported instead of surfacing later as a string of 401s.
def registry_token(repo)
  resp = http_get("#{AUTH_URL}&scope=repository:#{repo}:pull")
  raise "registry token request for #{repo} failed: HTTP #{resp.code}" unless resp.is_a?(Net::HTTPSuccess)

  JSON.parse(resp.body)['token']
end

# Authenticated GET against the registry; returns the Net::HTTPResponse with
# its body fully read.
def registry_get(uri, token)
  Net::HTTP.start(uri.host, uri.port, use_ssl: true, open_timeout: 15, read_timeout: 30) do |http|
    req = Net::HTTP::Get.new(uri)
    req['Authorization'] = "Bearer #{token}"
    req['Accept'] = MANIFEST_ACCEPT
    http.request(req)
  end
end

# Wraps registry_get with automatic retry on 429 (rate limit).
# Sleeps for Retry-After seconds (or exponential backoff, capped at 120s)
# before retrying. Performs one initial request plus at most +max_retries+
# retries and returns the last response, which is still a 429 when the
# retries are exhausted.
def registry_get_with_retry(uri, token, max_retries: 3)
  attempt = 0
  loop do
    resp = registry_get(uri, token)
    return resp if resp.code != '429' || attempt >= max_retries

    wait = [(resp['retry-after'].to_i.nonzero? || (2**attempt) * 15), 120].min
    attempt += 1
    warn " [rate limited (429), waiting #{wait}s before retry #{attempt}/#{max_retries}]".yellow
    sleep wait
  end
end

# Returns the v2 image manifest for a given tag, resolving manifest lists to
# the linux/amd64 platform entry automatically.
# Returns nil on error, :expired on 401 (token needs refresh).
def fetch_manifest(repo, tag, token, verbose: false)
  resp = registry_get_with_retry(URI("#{REGISTRY}/#{repo}/manifests/#{tag}"), token)
  return :expired if resp.code == '401'

  unless resp.is_a?(Net::HTTPSuccess)
    warn " manifest #{tag}: HTTP #{resp.code}".yellow if verbose
    return nil
  end

  manifest = JSON.parse(resp.body)
  content_type = resp['content-type'].to_s
  return manifest unless content_type.include?('manifest.list') || content_type.include?('image.index')

  # Multi-platform manifest list - drill down to linux/amd64
  entry = manifest['manifests']&.find do |m|
    m.dig('platform', 'os') == 'linux' && m.dig('platform', 'architecture') == 'amd64'
  end
  return nil unless entry

  resp = registry_get_with_retry(URI("#{REGISTRY}/#{repo}/manifests/#{entry['digest']}"), token)
  return :expired if resp.code == '401'

  unless resp.is_a?(Net::HTTPSuccess)
    warn " manifest #{tag} (amd64): HTTP #{resp.code}".yellow if verbose
    return nil
  end

  JSON.parse(resp.body)
end

# Requests the blob +digest+ of +repo+ and yields the first successful
# (2xx) response while its body is still unread, so the block can stream it.
# Returns the block's value, or nil when no successful response was obtained.
#
# Registry blobs usually redirect (302) to cloud storage (S3/GCS/CDN); the
# redirect is followed manually so we never send the registry Bearer token to
# a third-party host. Only a single redirect is followed.
def stream_blob(repo, digest, token)
  blob_uri = URI("#{REGISTRY}/#{repo}/blobs/#{digest}")
  redirect_uri = nil
  result = nil

  # First request: registry endpoint - expect a 302 to cloud storage
  Net::HTTP.start(blob_uri.host, blob_uri.port, use_ssl: true,
                                                open_timeout: 15, read_timeout: 30) do |http|
    req = Net::HTTP::Get.new(blob_uri)
    req['Authorization'] = "Bearer #{token}"
    http.request(req) do |resp|
      case resp
      when Net::HTTPRedirection
        redirect_uri = URI(resp['location'])
      when Net::HTTPSuccess
        result = yield(resp)
      end
    end
  end

  return result unless redirect_uri

  # Second request: cloud storage - stream, no auth header
  Net::HTTP.start(redirect_uri.host, redirect_uri.port, use_ssl: true,
                                                        open_timeout: 15, read_timeout: 300) do |http|
    http.request(Net::HTTP::Get.new(redirect_uri)) do |resp|
      result = yield(resp) if resp.is_a?(Net::HTTPSuccess)
    end
  end
  result
end

# Streams a single gzip-compressed layer blob and scans tar headers for the
# application-HASH.css filename. Stops the download as soon as the hash is
# found - no need to consume the full layer.
#
# Returns the hash string, or nil when the layer is zstd-compressed, holds no
# matching file, or a network / decompression error occurred (a warning is
# printed in the error case).
def scan_layer(repo, digest, token, mediatype: nil, verbose: false)
  if mediatype&.include?('zstd')
    warn " [skip zstd] #{digest[7, 16]}...".yellow if verbose
    return nil
  end
  warn " [scan] #{digest[7, 16]}... (#{mediatype || 'unknown'})".cyan if verbose

  # The throw has to leave the http.request block: when that block returns
  # normally, Net::HTTP reads the remainder of the response body before
  # handing control back, which would download the whole layer anyway.
  # Unwinding past Net::HTTP.start closes the connection instead.
  catch do |hash_found|
    stream_blob(repo, digest, token) do |resp|
      hash = scan_blob_stream(resp)
      throw hash_found, hash if hash
      nil
    end
  end
rescue Zlib::Error, Errno::ECONNRESET, Net::ReadTimeout, Net::OpenTimeout, OpenSSL::SSL::SSLError => e
  warn " layer #{digest[7, 16]}... #{e.class}: #{e.message}".yellow
  nil
end

# Streams resp body through gzip decompression and a TarCssScanner.
# Stops reading as soon as the CSS hash is found and returns it; returns nil
# when the response is not a success or the stream ends without a match.
# Note that the caller must abandon the HTTP request (see scan_layer) if the
# rest of the blob is not to be downloaded.
def scan_blob_stream(resp)
  return nil unless resp.is_a?(Net::HTTPSuccess)

  scanner = TarCssScanner.new
  inflater = Zlib::Inflate.new(Zlib::MAX_WBITS | 16) # gzip mode

  begin
    catch(:done) do
      resp.read_body do |chunk|
        scanner << inflater.inflate(chunk)
        throw :done if scanner.found
      end
    end
    scanner.found
  ensure
    begin
      inflater.close
    rescue StandardError
      nil
    end
  end
end

# Streams the full layer and collects every filename matching pattern.
# Returns an array of paths; empty when the blob could not be fetched or any
# error occurred (the error is printed).
def collect_layer_filenames(repo, digest, token, pattern: /\.css\z/)
  stream_blob(repo, digest, token) { |resp| stream_filenames(resp, pattern) } || []
rescue StandardError => e
  warn " collect error: #{e.class}: #{e.message}".red
  []
end

# Feeds the whole gzip-compressed response body through a
# TarFilenameCollector and returns the filenames it gathered.
def stream_filenames(resp, pattern)
  collector = TarFilenameCollector.new(pattern)
  inflater = Zlib::Inflate.new(Zlib::MAX_WBITS | 16) # gzip mode
  begin
    resp.read_body { |chunk| collector << inflater.inflate(chunk) }
    collector.filenames
  ensure
    begin
      inflater.close
    rescue StandardError
      nil
    end
  end
end

# Prints all filenames matching pattern across all layers of repo:tag.
def sample_tag(repo, tag, pattern: /\.css\z/)
  puts "\nSampling #{repo}:#{tag} for filenames matching #{pattern}...".cyan
  token = registry_token(repo)
  manifest = fetch_manifest(repo, tag, token)
  # fetch_manifest may also return :expired, which is truthy but not a manifest.
  unless manifest.is_a?(Hash)
    warn ' Could not fetch manifest'.red
    return
  end

  layers = manifest['layers']&.reverse || []
  puts " #{layers.size} layer(s), scanning newest-first...".cyan

  layers.each_with_index do |layer, idx|
    next if layer['mediaType']&.include?('zstd')

    print " Layer #{idx + 1}/#{layers.size} #{layer['digest'][7, 16]}... "
    $stdout.flush
    files = collect_layer_filenames(repo, layer['digest'], token, pattern: pattern)
    puts "(#{files.size} match#{files.size == 1 ? '' : 'es'})"
    files.each { |f| puts " #{f}" }
  end
end

# Fetches the manifest for repo:tag and scans layers newest-first.
# token_box is a single-element array [token] shared across threads; mutex
# protects refreshes so only one thread re-fetches when the token expires.
# Returns the CSS hash of the first layer that contains one, otherwise nil.
def get_css_hash(repo, tag, token_box, token_mutex, verbose: false)
  token = token_mutex.synchronize { token_box[0] }
  manifest = fetch_manifest(repo, tag, token, verbose: verbose)

  if manifest == :expired
    token_mutex.synchronize do
      # Only refresh if another thread hasn't already done it
      if token_box[0] == token
        token_box[0] = registry_token(repo)
        warn " [token refreshed for #{repo}]".yellow if verbose
      end
    end
    token = token_mutex.synchronize { token_box[0] }
    manifest = fetch_manifest(repo, tag, token, verbose: verbose)
    return nil if manifest == :expired
  end

  return nil unless manifest

  layers = manifest['layers']&.reverse
  return nil if layers.nil? || layers.empty?

  warn " #{layers.size} layer(s) found, scanning newest-first...".cyan if verbose
  layers.each do |layer|
    result = scan_layer(repo, layer['digest'], token, mediatype: layer['mediaType'], verbose: verbose)
    return result if result
  end
  nil
end

# -- JSON file helpers ----------------------------------------------------------

# Parses JSON_FILE and returns its contents.
# The code below treats the result as a Hash of css-hash => [low, high].
def load_json_map
  JSON.parse(File.read(JSON_FILE))
end

# Highest [major, minor, patch] among all version strings in +data+, or nil
# when none of them parses.
def max_version_in_map(data)
  data.values.flatten.filter_map { |v| parse_semver(v) }.max
end

# Merges consecutive [version, hash] pairs that share a hash into
# { hash:, low:, high: } ranges, preserving input order.
def collapse_ranges(version_hashes)
  entries = []
  version_hashes.each do |ver, hash|
    if entries.last && entries.last[:hash] == hash
      entries.last[:high] = ver
    else
      entries << { hash: hash, low: ver, high: ver }
    end
  end
  entries
end

# Rewrites JSON_FILE with one `"key": value` pair per line.
def write_json_map(data)
  lines = data.map { |k, v| " #{k.to_json}: #{v.to_json}" }
  File.write(JSON_FILE, "{\n#{lines.join(",\n")}\n}\n")
end

# Merges +new_entries+ (hashes with :hash, :low, :high) into the JSON map,
# prints a summary, and writes the file unless +dry_run+ is set.
def update_version_file(new_entries, dry_run:)
  data = load_json_map
  added = []
  updated = []

  new_entries.each do |e|
    if data.key?(e[:hash])
      # Hash already known - extend the high end of the range if the new version is higher.
      # When semvers are equal but suffixes differ, prefer -ee over -ce.
      existing_high_str = data[e[:hash]][1]
      existing_high = parse_semver(existing_high_str)
      new_high = parse_semver(e[:high])
      next unless new_high && existing_high

      cmp = new_high <=> existing_high
      next if cmp < 0
      # Same semver: only replace if we're upgrading from -ce to -ee
      next if cmp == 0 && !(existing_high_str.end_with?('-ce') && e[:high].end_with?('-ee'))

      data[e[:hash]][1] = e[:high] unless dry_run
      updated << e
    else
      data[e[:hash]] = [e[:low], e[:high]] unless dry_run
      added << e
    end
  end

  if added.empty? && updated.empty?
    puts 'No new entries to add - already up to date.'.green
    return
  end

  tag = dry_run ? ' [dry-run]' : ''
  unless added.empty?
    puts "\n#{added.size} new entr#{added.size == 1 ? 'y' : 'ies'} added#{tag}:".green
    added.each { |e| puts " #{e[:hash].to_json}: #{[e[:low], e[:high]].to_json}" }
  end
  unless updated.empty?
    puts "\n#{updated.size} existing entr#{updated.size == 1 ? 'y' : 'ies'} range-extended#{tag}:".cyan
    updated.each { |e| puts " #{e[:hash][0, 16]}... high -> #{e[:high]}" }
  end

  write_json_map(data) unless dry_run
end

# Finds the tags of one edition that are newer than +current_max+, fetches
# the CSS hash of each with up to MAX_CONCURRENT threads, and returns the
# results collapsed into { hash:, low:, high: } ranges in ascending version
# order. With opts[:dry_run] no registry traffic happens and placeholder
# hashes are returned instead.
def process_edition(edition, current_max, opts)
  repo = edition[:repo]
  tag_re = edition[:tag_re]
  label = edition[:label]
  version_fn = edition[:version_fn]

  puts "\nFetching GitLab #{label} tags from Docker Hub..."
  all_tags = fetch_all_tags(repo)
  puts " #{all_tags.size} total tags fetched."

  # tag_semver is nil for tags that do not match tag_re, so one pass both
  # filters by format and by version.
  newer = all_tags.select do |t|
    sv = tag_semver(t, tag_re)
    sv && (sv <=> current_max) > 0
  end
  candidates = newer.sort_by { |t| tag_semver(t, tag_re) }

  if candidates.empty?
    puts " No new #{label} versions found.".green
    return []
  end

  puts " Found #{candidates.size} new #{label} tag(s):".cyan
  candidates.each { |t| puts " #{t}" }

  if opts[:dry_run]
    puts '[dry-run] skipping registry layer fetch'.cyan
    return candidates.map do |t|
      { hash: "dryrun#{'0' * 57}", low: version_fn.call(t), high: version_fn.call(t) }
    end
  end

  token_box = [registry_token(repo)]
  token_mutex = Mutex.new
  lock = Mutex.new # serialises console output and writes to results
  results = {}
  work = Queue.new
  candidates.each { |t| work << t }

  worker_count = [MAX_CONCURRENT, candidates.size].min
  puts " Fetching CSS hashes (#{worker_count} parallel workers)...".cyan

  workers = Array.new(worker_count) do
    Thread.new do
      loop do
        # Non-blocking pop raises ThreadError once the queue is drained.
        tag = begin
          work.pop(true)
        rescue ThreadError
          break
        end

        begin
          ver = version_fn.call(tag)
          hash = get_css_hash(repo, tag, token_box, token_mutex, verbose: opts[:verbose])
          lock.synchronize do
            if hash
              puts " #{tag} ... #{hash[0, 16].green}..."
              results[tag] = [ver, hash]
            else
              puts " #{tag} ... #{'no CSS hash found'.yellow}"
            end
          end
        rescue StandardError => e
          lock.synchronize { warn " #{tag} ... #{e.class}: #{e.message}".red }
        end
      end
    end
  end

  workers.each(&:join)

  # Restore ascending version order; tags without a hash are dropped.
  ordered = candidates.filter_map { |t| results[t] }
  collapse_ranges(ordered)
end

# -- CLI -----------------------------------------------------------------------

options = { dry_run: false, verbose: false, sample: nil }

OptionParser.new do |opts|
  opts.banner = 'Usage: ruby tools/dev/update_gitlab_versions.rb [options]'
  opts.separator ''
  opts.separator 'Fetches GitLab EE/CE tags from Docker Hub, streams only the'
  opts.separator 'application layer from the Docker Registry API (no Docker daemon'
  opts.separator 'required), and updates version.json directly.'
  opts.separator ''

  opts.on('-n', '--dry-run', 'Show what would be added without modifying any files') do
    options[:dry_run] = true
  end

  opts.on('-v', '--verbose', 'Print layer mediatypes and scan progress') do
    options[:verbose] = true
  end

  opts.on('-s', '--sample REPO:TAG',
          'Dump all .css filenames from all layers of REPO:TAG (e.g. gitlab/gitlab-ce:17.0.0-ce.0)') do |val|
    options[:sample] = val
  end

  opts.on('-h', '--help', 'Display this help') do
    puts opts
    exit
  end
end.parse!

# -- sample mode ---------------------------------------------------------------

if options[:sample]
  repo, tag = options[:sample].split(':', 2)
  # Both halves must be present and non-empty (":tag" and "repo:" are rejected).
  unless repo && tag && !repo.empty? && !tag.empty?
    abort 'Usage: --sample REPO:TAG (e.g. gitlab/gitlab-ce:17.0.0-ce.0)'.red
  end
  sample_tag(repo, tag)
  exit
end

# -- main ----------------------------------------------------------------------

data = load_json_map
current_max = max_version_in_map(data)
abort 'Could not determine current max version from version.json'.red unless current_max

puts "Current max version in GITLAB_CSS_MAP: #{current_max.join('.')}".cyan

all_entries = EDITIONS.flat_map { |ed| process_edition(ed, current_max, options) }

update_version_file(all_entries, dry_run: options[:dry_run])