Path: blob/master/tools/dev/detect_dead_reference_links.rb
##
# This script checks the status of URLs from a provided JSON file.
# It validates if URLs are alive, handles redirects, and fetches Wayback Machine snapshots for URLs that are down.
# It logs the status of each URL, including errors, redirects, and archived snapshots.
#
# Usage: ruby tools/dev/detect_dead_reference_links.rb -f db/modules_metadata_base.json -l WARN
#

require 'net/http'
require 'uri'
require 'json'
require 'csv'
require 'concurrent'
require 'logger'
require 'fileutils'
require 'optparse'
require 'benchmark'

class UrlChecker
  WAYBACK_MACHINE_API_URL = 'https://archive.org/wayback/available?url='
  MAX_REDIRECTS = 5 # Maximum number of redirects to follow for each URL
  THREAD_POOL_SIZE = 5 # Number of threads in the pool to process URLs concurrently
  CHECKED_URLS_FILE = 'checked_urls.jsonl' # File to save URLs that have been checked
  BATCH_SIZE = 1000 # Number of URLs to process in each batch
  MAX_RETRIES = 3 # Maximum number of retries for failed requests to the Wayback Machine
  RETRY_DELAY = 5 # Delay in seconds between retries

  # Initializes the UrlChecker instance with given URLs and configuration options
  # @param [Array<Hash>] urls_with_paths A list of URLs with associated paths to check
  # @param [Logger::Level] log_level The logging level (defaults to Logger::INFO)
  def initialize(urls_with_paths, log_level: Logger::INFO)
    @urls_with_paths = urls_with_paths
    @results = []
    @checked_urls = load_checked_urls
    @url_times = []
    @logger = Logger.new($stdout)
    @logger.level = log_level
    @total_urls = urls_with_paths.size
    @processed_urls = 0
  end

  # Starts the process of checking all URLs in batches, logging results and saving progress
  # in a thread-safe manner.
  def check_urls
    pool = Concurrent::FixedThreadPool.new(THREAD_POOL_SIZE)
    at_exit { shutdown_thread_pool(pool) }

    # Process URLs in batches to avoid overwhelming the system
    @urls_with_paths.each_slice(BATCH_SIZE) do |batch|
      futures = batch.map do |url_with_path|
        Concurrent::Promises.future(pool) do
          result = check_url(url_with_path)
          @results << result
          @checked_urls << url_with_path[:url]
          save_progress(result)

          update_progress
        end
      end

      # Wait for all futures in the current batch to finish before proceeding
      Concurrent::Promises.zip(*futures).wait!

      # Sleep between batches to avoid overloading the server
      sleep 5
    end

    save_results_to_file
  ensure
    pool.shutdown
    pool.wait_for_termination
    @logger.info('Finished checking URLs.')
  end

  private

  # Filters out URLs that have already been checked.
  # @return [Array<Hash>] List of URLs and paths that have not been checked yet
  def unchecked_urls
    @urls_with_paths.reject { |url_with_path| @checked_urls.include?(url_with_path[:url]) }
  end

  # Checks a single URL and processes its response.
  # @param [Hash] url_with_path The URL and its associated path to check
  # @return [Hash] A result containing the URL, path, status, and archived snapshot (if available)
  def check_url(url_with_path)
    url_result = { url: url_with_path[:url], path: url_with_path[:path], status: nil, archived_snapshot: nil }

    # Skip non-URL references and Wayback links
    if !url_with_path[:url].start_with?('URL-')
      url_result[:status] = 'Skipped (not a URL- reference)'
      return url_result
    elsif url_with_path[:url].start_with?('http://web.archive.org/web')
      url_result[:status] = 'Wayback link (skipped)'
      return url_result
    end

    # Clean the URL and validate it
    cleaned_url = url_with_path[:url].sub(/^URL-/, '')

    # Check if the URL is valid
    if !valid_url?(cleaned_url)
      url_result[:status] = 'Invalid URL'
      return url_result
    end

    # Prepare the HTTP request
    uri = URI.parse(cleaned_url)
    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = uri.scheme == 'https'

    start_time = Time.now

    begin
      # Get the HTTP response and handle redirects
      response = get_response(http, uri)
      follow_redirects(http, uri, response)
    rescue StandardError => e
      handle_error(url_result, e)
    end

    # Process the response (check for success, failure, or error)
    process_response(response, url_result)
    elapsed_time = Time.now - start_time
    @url_times << elapsed_time

    url_result
  ensure
    save_progress(url_result)
  end

  # Validates if a URL is properly formatted
  # @param [String] url The URL to check
  # @return [Boolean] True if the URL is valid, false otherwise
  def valid_url?(url)
    URI.parse(url).is_a?(URI::HTTP)
  rescue StandardError
    false
  end

  # Sends an HTTP GET request to the specified URI
  # @param [Net::HTTP] http The HTTP client
  # @param [URI] uri The URI to send the GET request to
  # @return [Net::HTTPResponse] The HTTP response
  def get_response(http, uri)
    http.get(uri.request_uri)
  end

  # Follows HTTP redirects up to a maximum limit (MAX_REDIRECTS)
  # @param [Net::HTTP] http The HTTP client
  # @param [URI] uri The original URI
  # @param [Net::HTTPResponse] response The HTTP response to process
  def follow_redirects(http, uri, response)
    redirect_count = 0
    while response.is_a?(Net::HTTPRedirection) && redirect_count < MAX_REDIRECTS
      location = response['location']
      @logger.info("Redirecting to: #{location}")
      uri = URI.parse(location)
      response = http.get(uri.request_uri)
      redirect_count += 1
    end
  end

  # Processes the HTTP response to determine the URL status
  # @param [Net::HTTPResponse] response The HTTP response to process
  # @param [Hash] url_result The result hash to update with the status
  def process_response(response, url_result)
    if response.nil?
      url_result[:status] = 'Error: No response received'
    elsif response.is_a?(Net::HTTPSuccess) || response.is_a?(Net::HTTPRedirection)
      url_result[:status] = 'Alive'
    else
      url_result[:status] = "Not Alive (Status Code: #{response.code})"
      fetch_wayback_snapshot(url_result)
    end
  end

  # Handles errors encountered during URL checking (e.g., network errors)
  # @param [Hash] url_result The result hash to update with error information
  # @param [StandardError] error The error that was raised
  def handle_error(url_result, error)
    url_result[:status] = "Error: #{error.message}"
    url_result[:archived_snapshot] = nil
  end

  # Attempts to fetch the Wayback Machine snapshot for the URL
  # @param [Hash] url_result The result hash to update with the Wayback snapshot information
  def fetch_wayback_snapshot(url_result)
    wayback_url = "#{WAYBACK_MACHINE_API_URL}#{url_result[:url]}"
    retries = 0

    begin
      uri = URI.parse(wayback_url)
      response = Net::HTTP.get_response(uri)
      handle_wayback_response(response, url_result)
    rescue StandardError => e
      retries += 1
      if retries <= MAX_RETRIES
        @logger.warn("Error fetching Wayback snapshot for #{url_result[:url]}: #{e.message}. Retrying in #{RETRY_DELAY} seconds... (Attempt #{retries} of #{MAX_RETRIES})")
        sleep(RETRY_DELAY)
        retry
      else
        url_result[:archived_snapshot] = "Error fetching Wayback snapshot after #{MAX_RETRIES} attempts: #{e.message}"
      end
    end
  end

  # Processes the response from the Wayback Machine API
  # @param [Net::HTTPResponse] response The response from the Wayback Machine
  # @param [Hash] url_result The result hash to update with the archived snapshot URL
  def handle_wayback_response(response, url_result)
    if response.is_a?(Net::HTTPSuccess)
      data = JSON.parse(response.body)
      snapshot = data.dig('archived_snapshots', 'closest', 'url')
      url_result[:archived_snapshot] = snapshot || 'No archived version found'
    else
      url_result[:archived_snapshot] = 'Error fetching Wayback Machine data'
    end
  end

  # Saves the final results of the URL checks to a JSON file
  def save_results_to_file
    File.open('url_check_results.json', 'w') { |file| file.write(JSON.pretty_generate(@results)) }
    @logger.info('Results have been saved to "url_check_results.json".')
  end

  # Saves the progress of checked URLs to a file
  # @param [Hash] result The result of a single URL check
  def save_progress(result)
    File.open(CHECKED_URLS_FILE, 'a') { |file| file.puts JSON.generate(result) }
  end

  # Loads the list of already checked URLs from the progress file
  # @return [Array<String>] A list of checked URLs
  def load_checked_urls
    return [] unless File.exist?(CHECKED_URLS_FILE)

    File.readlines(CHECKED_URLS_FILE).map { |row| JSON.parse(row)['url'] }
  end

  # Shuts down the thread pool after URL checking is complete
  # @param [Concurrent::FixedThreadPool] pool The thread pool to shut down
  def shutdown_thread_pool(pool)
    pool.shutdown
    pool.wait_for_termination
    @logger.info('Thread pool shut down successfully.')
  end

  # Updates the progress bar based on the number of URLs processed
  def update_progress
    @processed_urls += 1
    percentage = (@processed_urls.to_f / @total_urls * 100).round
    bar_length = 50
    progress = ('=' * (percentage / 2)).ljust(bar_length, ' ')
    print "\r[#{progress}] #{percentage}% (#{@processed_urls}/#{@total_urls})"
  end
end

# Main entry point to run the URL checking process
if __FILE__ == $PROGRAM_NAME
  options = {}
  OptionParser.new do |opts|
    opts.banner = 'Usage: ruby url_checker.rb [options]'

    opts.on('-f', '--file FILE', 'JSON file containing URLs and paths') do |file|
      options[:file] = file
    end

    opts.on('-l', '--log-level LEVEL', 'Log level (DEBUG, INFO, WARN, ERROR, FATAL, UNKNOWN)') do |log_level|
      options[:log_level] = log_level.upcase.to_sym
    end
  end.parse!

  # Validate input file
  unless options[:file] && File.exist?(options[:file])
    puts 'Please provide a valid JSON file with URLs and paths.'
    exit 1
  end

  # Handling for log level
  log_level = options[:log_level] || 'INFO'
  log_level = Logger.const_get(log_level)

  # Parse the JSON file containing URLs and paths
  urls_with_paths = JSON.parse(File.read(options[:file]))

  # Map the data to the format required by the checker
  mapped_data = urls_with_paths.flat_map do |_path, metadata|
    metadata['references'].map { |ref| { 'path' => metadata['path'], 'ref' => ref } }
  end

  # Validate the structure of the mapped data
  unless mapped_data.is_a?(Array) && mapped_data.all? { |entry| entry['ref'] && entry['path'] }
    puts "Invalid JSON structure. The file should contain an array of objects with 'ref' and 'path' keys."
    exit 1
  end

  # Create the final list of URLs and paths
  urls_with_paths_final = mapped_data.map { |entry| { url: entry['ref'], path: entry['path'] } }

  start_time = Time.now

  # Create and run the UrlChecker instance
  url_checker = UrlChecker.new(urls_with_paths_final, log_level: log_level)
  url_checker.check_urls

  end_time = Time.now
  # Calculate and display the total time taken
  elapsed_time = end_time - start_time
  minutes = (elapsed_time / 60).to_i
  seconds = (elapsed_time % 60).to_i

  puts "\nTotal time taken: #{minutes} minutes and #{seconds} seconds"
end
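Note on input shape: the following is a minimal sketch, not part of the script. Judging from the flat_map in the entry point, the JSON file (e.g. db/modules_metadata_base.json) is assumed to be a hash keyed by module name whose values carry a 'path' string and a 'references' array; the module name, path, and reference values below are hypothetical placeholders used only to illustrate the two mapping steps applied before the list is handed to UrlChecker. Separately, progress is appended to checked_urls.jsonl as one JSON object per checked URL (url, path, status, archived_snapshot), and load_checked_urls reads the url values back into @checked_urls at startup.

# Minimal sketch of the assumed input shape; the module name, path, and references are hypothetical.
sample_metadata = {
  'example_module' => {
    'path' => 'modules/exploits/example/example_module.rb',
    'references' => ['URL-http://example.com/advisory', 'CVE-2000-0001']
  }
}

# The same two mapping steps the entry point performs before building UrlChecker.
mapped_data = sample_metadata.flat_map do |_name, metadata|
  metadata['references'].map { |ref| { 'path' => metadata['path'], 'ref' => ref } }
end
urls_with_paths_final = mapped_data.map { |entry| { url: entry['ref'], path: entry['path'] } }

p urls_with_paths_final
# Prints two { url:, path: } hashes, one per reference, both pointing at the hypothetical module path;
# check_url would later mark the CVE entry 'Skipped (not a URL- reference)' and probe only the URL- one.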