Path: blob/master/lib/anemone/cli/cron.rb
require 'anemone'
require 'optparse'
require 'ostruct'

options = OpenStruct.new
options.relative = false
options.output_file = 'urls.txt'

begin
  # make sure that the last argument is a URL we can crawl
  root = URI(ARGV.last)
rescue
  puts <<-INFO
Usage:
  anemone cron [options] <url>

Synopsis:
  Combination of `count`, `pagedepth` and `url-list` commands.
  Performs pagedepth, url list, and count functionality.
  Outputs results to STDOUT and link list to file (urls.txt).
  Meant to be run daily as a cron job.

Options:
  -r, --relative           Output relative URLs (rather than absolute)
  -o, --output filename    Filename to save URL list to. Defaults to urls.txt.
INFO
  exit(0)
end

# parse command-line options
opts = OptionParser.new
opts.on('-r', '--relative')        { options.relative = true }
opts.on('-o', '--output filename') {|o| options.output_file = o }
opts.parse!(ARGV)

Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|

  anemone.after_crawl do |pages|
    puts "Crawl results for #{root}\n"

    # print a list of 404's
    not_found = []
    pages.each_value do |page|
      url = page.url.to_s
      not_found << url if page.not_found?
    end
    unless not_found.empty?
      puts "\n404's:"

      missing_links = pages.urls_linking_to(not_found)
      missing_links.each do |url, links|
        if options.relative
          puts URI(url).path.to_s
        else
          puts url
        end
        links.slice(0..10).each do |u|
          u = u.path if options.relative
          puts "  linked from #{u}"
        end

        puts "  ..." if links.size > 10
      end

      print "\n"
    end

    # remove redirect aliases, and calculate pagedepths
    pages = pages.shortest_paths!(root).uniq
    depths = pages.values.inject({}) do |depths, page|
      depths[page.depth] ||= 0
      depths[page.depth] += 1
      depths
    end

    # print the page count
    puts "Total pages: #{pages.size}\n"

    # print a list of depths
    depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }

    # output a list of urls to file
    file = open(options.output_file, 'w')
    pages.each_key do |url|
      url = options.relative ? url.path.to_s : url.to_s
      file.puts url
    end
  end

end
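The synopsis notes that this command is meant to be run daily as a cron job. As a minimal sketch of that setup (the schedule, paths, and target URL below are illustrative, not part of the source), a crontab entry driving it could look like:

  # crawl example.com every day at 03:00, saving the URL list to /var/tmp/urls.txt
  # and appending the count/pagedepth report printed to STDOUT to a log file
  0 3 * * * anemone cron -o /var/tmp/urls.txt http://example.com >> /var/log/anemone-cron.log 2>&1

As described in the usage text above, -r switches the output to relative URLs and -o overrides the default urls.txt output file; the crawl target must be the last argument, since the script reads it with URI(ARGV.last).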