CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Sign Up | Sign In
rapid7

CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!

GitHub Repository: rapid7/metasploit-framework
Path: blob/master/lib/anemone/cli/cron.rb
Views: 1904
1
require 'anemone'
2
require 'optparse'
3
require 'ostruct'
4
5
# Default settings for this command; the option handlers further down
# overwrite these fields when the matching switch is given.
#   relative    - false: URLs are reported in absolute form
#   output_file - file the URL list is written to
options = OpenStruct.new(:relative => false, :output_file => 'urls.txt')
8
9
# Validate the crawl target before doing anything else: the last command-line
# argument must be something URI() accepts. If it is missing or malformed,
# print the usage text and stop.
begin
  # make sure that the last argument is a URL we can crawl
  root = URI(ARGV.last)
rescue ArgumentError, URI::InvalidURIError
  # ArgumentError       - no arguments were given (ARGV.last is nil)
  # URI::InvalidURIError - the last argument cannot be parsed as a URI
  # Anything else is a genuine failure and is allowed to propagate.
  puts <<-INFO
Usage:
anemone cron [options] <url>

Synopsis:
Combination of `count`, `pagedepth` and `url-list` commands.
Performs pagedepth, url list, and count functionality.
Outputs results to STDOUT and link list to file (urls.txt).
Meant to be run daily as a cron job.

Options:
-r, --relative Output relative URLs (rather than absolute)
-o, --output filename Filename to save URL list to. Defaults to urls.txt.
INFO
  # Exit status 0 is kept as-is so existing callers see no change.
  exit(0)
end
29
30
# Command-line switches. Each handler records its setting on `options`:
#   -r / --relative          sets options.relative to true
#   -o / --output filename   stores the given filename in options.output_file
opts = OptionParser.new do |parser|
  parser.on('-r', '--relative') { options.relative = true }
  parser.on('-o', '--output filename') { |filename| options.output_file = filename }
end

# parse! works on ARGV in place.
opts.parse!(ARGV)
35
36
# Crawl the site starting at `root` (with :discard_page_bodies enabled) and,
# once the crawl is finished, report on it:
#   1. 404 pages and the pages linking to them          -> STDOUT
#   2. total page count and number of pages per depth   -> STDOUT
#   3. the list of crawled URLs                         -> options.output_file
# With options.relative set, URLs are reported as paths only.
Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|

  anemone.after_crawl do |pages|
    puts "Crawl results for #{root}\n"

    # print a list of 404's
    not_found = []
    pages.each_value do |page|
      not_found << page.url.to_s if page.not_found?
    end

    unless not_found.empty?
      puts "\n404's:"

      # Cap on referring pages listed per missing URL. The same value drives
      # both the slice and the " ..." marker, so the marker appears exactly
      # when some referrers were left out.
      max_links_shown = 10

      missing_links = pages.urls_linking_to(not_found)
      missing_links.each do |url, links|
        puts(options.relative ? URI(url).path.to_s : url)

        links.slice(0, max_links_shown).each do |u|
          u = u.path if options.relative
          puts " linked from #{u}"
        end

        puts " ..." if links.size > max_links_shown
      end

      print "\n"
    end

    # remove redirect aliases, and calculate pagedepths
    unique_pages = pages.shortest_paths!(root).uniq
    depth_counts = unique_pages.values.each_with_object(Hash.new(0)) do |page, counts|
      counts[page.depth] += 1
    end

    # print the page count
    puts "Total pages: #{unique_pages.size}\n"

    # print a list of depths
    depth_counts.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }

    # output a list of urls to file
    # File.open with a block closes (and flushes) the file even if writing
    # raises. Unlike Kernel#open, it never treats a filename starting with
    # "|" as a command to run, which matters because the name comes from the
    # command line.
    File.open(options.output_file, 'w') do |file|
      unique_pages.each_key do |url|
        file.puts(options.relative ? url.path.to_s : url.to_s)
      end
    end
  end

end
91
92