Path: blob/master/tools/dev/detect_dead_reference_links.rb
##
# This script checks the status of URLs from a provided JSON file.
# It validates if URLs are alive, handles redirects, and fetches Wayback Machine snapshots for URLs that are down.
# It logs the status of each URL, including errors, redirects, and archived snapshots.
#
# Usage: ruby tools/dev/detect_dead_reference_links.rb -f db/modules_metadata_base.json -l WARN
#

require 'net/http'
require 'uri'
require 'json'
require 'csv'
require 'concurrent'
require 'logger'
require 'fileutils'
require 'optparse'
require 'benchmark'

class UrlChecker
  WAYBACK_MACHINE_API_URL = 'https://archive.org/wayback/available?url='
  MAX_REDIRECTS = 5 # Maximum number of redirects to follow for each URL
  THREAD_POOL_SIZE = 5 # Number of threads in the pool to process URLs concurrently
  CHECKED_URLS_FILE = 'checked_urls.jsonl' # File to save URLs that have been checked
  BATCH_SIZE = 1000 # Number of URLs to process in each batch
  MAX_RETRIES = 3 # Maximum number of retries for failed requests to the Wayback Machine
  RETRY_DELAY = 5 # Delay in seconds between retries

  # Initializes the UrlChecker instance with given URLs and configuration options
  # @param [Array<Hash>] urls_with_paths A list of URLs with associated paths to check
  # @param [Logger::Level] log_level The logging level (defaults to Logger::INFO)
  def initialize(urls_with_paths, log_level: Logger::INFO)
    @urls_with_paths = urls_with_paths
    @results = []
    @checked_urls = load_checked_urls
    @url_times = []
    @logger = Logger.new($stdout)
    @logger.level = log_level
    @total_urls = urls_with_paths.size
    @processed_urls = 0
  end

  # Starts the process of checking all URLs in batches, logging results and saving progress
  # in a thread-safe manner.
  def check_urls
    pool = Concurrent::FixedThreadPool.new(THREAD_POOL_SIZE)
    at_exit { shutdown_thread_pool(pool) }

    # Process URLs in batches to avoid overwhelming the system
    @urls_with_paths.each_slice(BATCH_SIZE) do |batch|
      futures = batch.map do |url_with_path|
        Concurrent::Promises.future(pool) do
          result = check_url(url_with_path)
          @results << result
          @checked_urls << url_with_path[:url]
          save_progress(result)

          update_progress
        end
      end

      # Wait for all futures in the current batch to finish before proceeding
      Concurrent::Promises.zip(*futures).wait!

      # Sleep between batches to avoid overloading the server
      sleep 5
    end

    save_results_to_file
  ensure
    pool.shutdown
    pool.wait_for_termination
    @logger.info('Finished checking URLs.')
  end

  private

  # Filters out URLs that have already been checked.
  # @return [Array<Hash>] List of URLs and paths that have not been checked yet
  def unchecked_urls
    @urls_with_paths.reject { |url_with_path| @checked_urls.include?(url_with_path[:url]) }
  end

  # Checks a single URL and processes its response.
  # @param [Hash] url_with_path The URL and its associated path to check
  # @return [Hash] A result containing the URL, path, status, and archived snapshot (if available)
  def check_url(url_with_path)
    url_result = { url: url_with_path[:url], path: url_with_path[:path], status: nil, archived_snapshot: nil }

    # Skip non-URL references and Wayback links
    if !url_with_path[:url].start_with?('URL-')
      url_result[:status] = 'Skipped (not a URL- reference)'
      return url_result
    elsif url_with_path[:url].start_with?('http://web.archive.org/web')
      url_result[:status] = 'Wayback link (skipped)'
      return url_result
    end

    # Clean the URL and validate it
    cleaned_url = url_with_path[:url].sub(/^URL-/, '')

    # Check if the URL is valid
    if !valid_url?(cleaned_url)
      url_result[:status] = 'Invalid URL'
      return url_result
    end

    # Prepare the HTTP request
    uri = URI.parse(cleaned_url)
    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = uri.scheme == 'https'

    start_time = Time.now

    begin
      # Get the HTTP response and handle redirects
      response = get_response(http, uri)
      follow_redirects(http, uri, response)
    rescue StandardError => e
      handle_error(url_result, e)
    end

    # Process the response (check for success, failure, or error)
    process_response(response, url_result)
    elapsed_time = Time.now - start_time
    @url_times << elapsed_time

    url_result
  ensure
    save_progress(url_result)
  end

  # Validates if a URL is properly formatted
  # @param [String] url The URL to check
  # @return [Boolean] True if the URL is valid, false otherwise
  def valid_url?(url)
    URI.parse(url).is_a?(URI::HTTP)
  rescue StandardError
    false
  end

  # Sends an HTTP GET request to the specified URI
  # @param [Net::HTTP] http The HTTP client
  # @param [URI] uri The URI to send the GET request to
  # @return [Net::HTTPResponse] The HTTP response
  def get_response(http, uri)
    http.get(uri.request_uri)
  end

  # Follows HTTP redirects up to a maximum limit (MAX_REDIRECTS)
  # @param [Net::HTTP] http The HTTP client
  # @param [URI] uri The original URI
  # @param [Net::HTTPResponse] response The HTTP response to process
  def follow_redirects(http, uri, response)
    redirect_count = 0
    while response.is_a?(Net::HTTPRedirection) && redirect_count < MAX_REDIRECTS
      location = response['location']
      @logger.info("Redirecting to: #{location}")
      uri = URI.parse(location)
      response = http.get(uri.request_uri)
      redirect_count += 1
    end
  end

  # Processes the HTTP response to determine the URL status
  # @param [Net::HTTPResponse] response The HTTP response to process
  # @param [Hash] url_result The result hash to update with the status
  def process_response(response, url_result)
    if response.nil?
      url_result[:status] = 'Error: No response received'
    elsif response.is_a?(Net::HTTPSuccess) || response.is_a?(Net::HTTPRedirection)
      url_result[:status] = 'Alive'
    else
      url_result[:status] = "Not Alive (Status Code: #{response.code})"
      fetch_wayback_snapshot(url_result)
    end
  end

  # Handles errors encountered during URL checking (e.g., network errors)
  # @param [Hash] url_result The result hash to update with error information
  # @param [StandardError] error The error that was raised
  def handle_error(url_result, error)
    url_result[:status] = "Error: #{error.message}"
    url_result[:archived_snapshot] = nil
  end

  # Attempts to fetch the Wayback Machine snapshot for the URL
  # @param [Hash] url_result The result hash to update with the Wayback snapshot information
  def fetch_wayback_snapshot(url_result)
    wayback_url = "#{WAYBACK_MACHINE_API_URL}#{url_result[:url]}"
    retries = 0

    begin
      uri = URI.parse(wayback_url)
      response = Net::HTTP.get_response(uri)
      handle_wayback_response(response, url_result)
    rescue StandardError => e
      retries += 1
      if retries <= MAX_RETRIES
        @logger.warn("Error fetching Wayback snapshot for #{url_result[:url]}: #{e.message}. Retrying in #{RETRY_DELAY} seconds... (Attempt #{retries} of #{MAX_RETRIES})")
        sleep(RETRY_DELAY)
        retry
      else
        url_result[:archived_snapshot] = "Error fetching Wayback snapshot after #{MAX_RETRIES} attempts: #{e.message}"
      end
    end
  end

  # Processes the response from the Wayback Machine API
  # @param [Net::HTTPResponse] response The response from the Wayback Machine
  # @param [Hash] url_result The result hash to update with the archived snapshot URL
  def handle_wayback_response(response, url_result)
    if response.is_a?(Net::HTTPSuccess)
      data = JSON.parse(response.body)
      snapshot = data.dig('archived_snapshots', 'closest', 'url')
      url_result[:archived_snapshot] = snapshot || 'No archived version found'
    else
      url_result[:archived_snapshot] = 'Error fetching Wayback Machine data'
    end
  end

  # Saves the final results of the URL checks to a JSON file
  def save_results_to_file
    File.open('url_check_results.json', 'w') { |file| file.write(JSON.pretty_generate(@results)) }
    @logger.info('Results have been saved to "url_check_results.json".')
  end

  # Saves the progress of checked URLs to a file
  # @param [Hash] result The result of a single URL check
  def save_progress(result)
    File.open(CHECKED_URLS_FILE, 'a') { |file| file.puts JSON.generate(result) }
  end

  # Loads the list of already checked URLs from the progress file
  # @return [Array<String>] A list of checked URLs
  def load_checked_urls
    return [] unless File.exist?(CHECKED_URLS_FILE)

    File.readlines(CHECKED_URLS_FILE).map { |row| JSON.parse(row)['url'] }
  end

  # Shuts down the thread pool after URL checking is complete
  # @param [Concurrent::FixedThreadPool] pool The thread pool to shut down
  def shutdown_thread_pool(pool)
    pool.shutdown
    pool.wait_for_termination
    @logger.info('Thread pool shut down successfully.')
  end

  # Updates the progress bar based on the number of URLs processed
  def update_progress
    @processed_urls += 1
    percentage = (@processed_urls.to_f / @total_urls * 100).round
    bar_length = 50
    progress = ('=' * (percentage / 2)).ljust(bar_length, ' ')
    print "\r[#{progress}] #{percentage}% (#{@processed_urls}/#{@total_urls})"
  end
end

# Main entry point to run the URL checking process
if __FILE__ == $PROGRAM_NAME
  options = {}
  OptionParser.new do |opts|
    opts.banner = 'Usage: ruby url_checker.rb [options]'

    opts.on('-f', '--file FILE', 'JSON file containing URLs and paths') do |file|
      options[:file] = file
    end

    opts.on('-l', '--log-level LEVEL', 'Log level (DEBUG, INFO, WARN, ERROR, FATAL, UNKNOWN)') do |log_level|
      options[:log_level] = log_level.upcase.to_sym
    end
  end.parse!

  # Validate input file
  unless options[:file] && File.exist?(options[:file])
    puts 'Please provide a valid JSON file with URLs and paths.'
    exit 1
  end

  # Handling for log level
  log_level = options[:log_level] || 'INFO'
  log_level = Logger.const_get(log_level)

  # Parse the JSON file containing URLs and paths
  urls_with_paths = JSON.parse(File.read(options[:file]))

  # Map the data to the format required by the checker
  mapped_data = urls_with_paths.flat_map do |_path, metadata|
    metadata['references'].map { |ref| { 'path' => metadata['path'], 'ref' => ref } }
  end

  # Validate the structure of the mapped data
  unless mapped_data.is_a?(Array) && mapped_data.all? { |entry| entry['ref'] && entry['path'] }
    puts "Invalid JSON structure. The file should contain an array of objects with 'ref' and 'path' keys."
    exit 1
  end

  # Create the final list of URLs and paths
  urls_with_paths_final = mapped_data.map { |entry| { url: entry['ref'], path: entry['path'] } }

  start_time = Time.now

  # Create and run the UrlChecker instance
  url_checker = UrlChecker.new(urls_with_paths_final, log_level: log_level)
  url_checker.check_urls

  end_time = Time.now
  # Calculate and display the total time taken
  elapsed_time = end_time - start_time
  minutes = (elapsed_time / 60).to_i
  seconds = (elapsed_time % 60).to_i

  puts "\nTotal time taken: #{minutes} minutes and #{seconds} seconds"
end
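Note on input shape: the following is a minimal sketch, not part of the script. Judging from the flat_map in the entry point, the JSON file (e.g. db/modules_metadata_base.json) is assumed to be a hash keyed by module name whose values carry a 'path' string and a 'references' array; the module name, path, and reference values below are hypothetical placeholders used only to illustrate the two mapping steps applied before the list is handed to UrlChecker. Separately, progress is appended to checked_urls.jsonl as one JSON object per checked URL (url, path, status, archived_snapshot), and load_checked_urls reads the url values back into @checked_urls at startup.

# Minimal sketch of the assumed input shape; the module name, path, and references are hypothetical.
sample_metadata = {
  'example_module' => {
    'path' => 'modules/exploits/example/example_module.rb',
    'references' => ['URL-http://example.com/advisory', 'CVE-2000-0001']
  }
}

# The same two mapping steps the entry point performs before building UrlChecker.
mapped_data = sample_metadata.flat_map do |_name, metadata|
  metadata['references'].map { |ref| { 'path' => metadata['path'], 'ref' => ref } }
end
urls_with_paths_final = mapped_data.map { |entry| { url: entry['ref'], path: entry['path'] } }

p urls_with_paths_final
# Prints two { url:, path: } hashes, one per reference, both pointing at the hypothetical module path;
# check_url would later mark the CVE entry 'Skipped (not a URL- reference)' and probe only the URL- one.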