
GitHub Repository: rapid7/metasploit-framework
Path: blob/master/lib/anemone/core.rb
require 'thread'
require 'robots'
require 'anemone/tentacle'
require 'anemone/page'
require 'anemone/exceptions'
require 'anemone/page_store'
require 'anemone/storage'
require 'anemone/storage/base'

module Anemone

  VERSION = '0.5.0'

  #
  # Convenience method to start a crawl
  #
  def Anemone.crawl(urls, options = {}, &block)
    Core.crawl(urls, options, &block)
  end
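
  # A minimal usage sketch (illustrative only; the URL, option value, and
  # block below are placeholders, not part of this file):
  #
  #   Anemone.crawl("http://www.example.com", :depth_limit => 2) do |anemone|
  #     anemone.on_every_page { |page| puts page.url }
  #   end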

  class Core

    # PageStore storing all Page objects encountered during the crawl
    attr_reader :pages
    # Hash of options for the crawl
    attr_reader :opts

    DEFAULT_OPTS = {
      # run 4 Tentacle threads to fetch pages
      :threads => 4,
      # disable verbose output
      :verbose => false,
      # don't throw away the page response body after scanning it for links
      :discard_page_bodies => false,
      # identify self as Anemone/VERSION
      :user_agent => "Anemone/#{Anemone::VERSION}",
      # no delay between requests
      :delay => 0,
      # don't obey the robots exclusion protocol
      :obey_robots_txt => false,
      # by default, don't limit the depth of the crawl
      :depth_limit => false,
      # number of times HTTP redirects will be followed
      :redirect_limit => 5,
      # storage engine defaults to Hash in +process_options+ if none specified
      :storage => nil,
      # hash of cookie name => value to send with HTTP requests
      :cookies => nil,
      # basic authentication data to send with HTTP requests
      :http_basic_auth => nil,
      # array of raw header lines to inject into each request
      :inject_headers => [],
      # accept cookies from the server and send them back?
      :accept_cookies => false,
      # skip any link with a query string? e.g. http://foo.com/?u=user
      :skip_query_strings => false,
      :dirbust => true
    }

    # Create setter methods for all options to be called from the crawl block
    DEFAULT_OPTS.keys.each do |key|
      define_method "#{key}=" do |value|
        @opts[key.to_sym] = value
      end
    end
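
    # These generated setters let the block passed to a crawl adjust any
    # option by name; a brief sketch (the values are placeholders):
    #
    #   Anemone.crawl("http://www.example.com") do |anemone|
    #     anemone.threads = 2
    #     anemone.verbose = true
    #   end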

    #
    # Initialize the crawl with starting *urls* (single URL or Array of URLs)
    # and optional *block*
    #
    def initialize(urls, opts = {})
      @urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
      @urls.each{ |url| url.path = '/' if url.path.empty? }

      @tentacles = []
      @on_every_page_blocks = []
      @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
      @skip_link_patterns = []
      @after_crawl_blocks = []
      @opts = opts

      yield self if block_given?
    end

    #
    # Convenience method to start a new crawl
    #
    def self.crawl(urls, opts = {})
      self.new(urls, opts) do |core|
        yield core if block_given?
        core.run
      end
    end

    #
    # Add a block to be executed on the PageStore after the crawl
    # is finished
    #
    def after_crawl(&block)
      @after_crawl_blocks << block
      self
    end
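
    # For example (a hypothetical sketch; the block receives the PageStore
    # built during the crawl):
    #
    #   anemone.after_crawl do |pages|
    #     pages.each { |url, page| puts "#{url} => #{page.code}" }
    #   end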

    #
    # Add one or more Regex patterns for URLs which should not be
    # followed
    #
    def skip_links_like(*patterns)
      @skip_link_patterns.concat [patterns].flatten.compact
      self
    end
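
    # For instance (illustrative patterns only):
    #
    #   anemone.skip_links_like(/logout/, /\.pdf$/)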

    #
    # Add a block to be executed on every Page as it is encountered
    # during the crawl
    #
    def on_every_page(&block)
      @on_every_page_blocks << block
      self
    end
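
    # A short sketch (the block body is a placeholder):
    #
    #   anemone.on_every_page { |page| puts "#{page.code} #{page.url}" }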

    #
    # Add a block to be executed on Page objects with a URL matching
    # one or more patterns
    #
    def on_pages_like(*patterns, &block)
      if patterns
        patterns.each do |pattern|
          @on_pages_like_blocks[pattern] << block
        end
      end
      self
    end
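
    # For example (the pattern and block are placeholders):
    #
    #   anemone.on_pages_like(/\/articles\//) do |page|
    #     puts "article: #{page.url}"
    #   end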

    #
    # Specify a block which will select which links to follow on each page.
    # The block should return an Array of URI objects.
    #
    def focus_crawl(&block)
      @focus_crawl_block = block
      self
    end
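
    # A hypothetical sketch that restricts the crawl to links on the same
    # host as the page they were found on:
    #
    #   anemone.focus_crawl do |page|
    #     page.links.select { |uri| uri.host == page.url.host }
    #   end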

    #
    # Perform the crawl
    #
    def run
      process_options

      @urls.delete_if { |url| !visit_link?(url) }
      return if @urls.empty?

      link_queue = Queue.new
      page_queue = Queue.new

      @opts[:threads].times do
        @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
      end

      @urls.each{ |url| link_queue.enq(url) }

      loop do
        page = page_queue.deq
        @pages.touch_key page.url
        puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
        do_page_blocks page
        page.discard_doc! if @opts[:discard_page_bodies]

        links = links_to_follow page
        links.each do |link|
          link_queue << [link, page.url.dup, page.depth + 1]
        end
        @pages.touch_keys links

        @pages[page.url] = page

        # if we are done with the crawl, tell the threads to end
        if link_queue.empty? and page_queue.empty?
          until link_queue.num_waiting == @tentacles.size
            Thread.pass
          end
          if page_queue.empty?
            @tentacles.size.times { link_queue << :END }
            break
          end
        end
      end

      @tentacles.each { |thread| thread.join }
      do_after_crawl_blocks
      self
    end

    private

    def process_options
      @opts = DEFAULT_OPTS.merge @opts
      @opts[:threads] = 1 if @opts[:delay] > 0
      storage = Anemone::Storage::Base.new(@opts[:storage] || Anemone::Storage.Hash)
      @pages = PageStore.new(storage)
      @robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]

      # freeze_options
    end

    #
    # Freeze the opts Hash so that no options can be modified
    # once the crawl begins
    #
    def freeze_options
      @opts.freeze
      @opts.each_key { |key| @opts[key].freeze }
      @opts[:cookies].each_key { |key| @opts[:cookies][key].freeze } rescue nil
    end

    #
    # Execute the after_crawl blocks
    #
    def do_after_crawl_blocks
      @after_crawl_blocks.each { |block| block.call(@pages) }
    end

    #
    # Execute the on_every_page blocks for *page*
    #
    def do_page_blocks(page)
      @on_every_page_blocks.each do |block|
        block.call(page)
      end

      @on_pages_like_blocks.each do |pattern, blocks|
        blocks.each { |block| block.call(page) } if page.url.to_s =~ pattern
      end
    end

    #
    # Return an Array of links to follow from the given page.
    # Based on whether or not the link has already been crawled,
    # and the block given to focus_crawl()
    #
    def links_to_follow(page)
      links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
      links.select { |link| visit_link?(link, page) }.map { |link| link.dup }
    end

    #
    # Returns +true+ if *link* has not been visited already,
    # and is not excluded by a skip_link pattern...
    # and is not excluded by robots.txt...
    # and is not deeper than the depth limit
    # Returns +false+ otherwise.
    #
    def visit_link?(link, from_page = nil)
      !@pages.has_page?(link) &&
      !skip_link?(link) &&
      !skip_query_string?(link) &&
      allowed(link) &&
      !too_deep?(from_page)
    end

    #
    # Returns +true+ if we are obeying robots.txt and the link
    # is granted access in it. Always returns +true+ when we are
    # not obeying robots.txt.
    #
    def allowed(link)
      @opts[:obey_robots_txt] ? @robots.allowed?(link) : true
    end

    #
    # Returns +true+ if we are over the page depth limit.
    # This only works when coming from a page and with the +depth_limit+ option set.
    # When neither is the case, will always return +false+.
    #
    def too_deep?(from_page)
      if from_page && @opts[:depth_limit]
        from_page.depth >= @opts[:depth_limit]
      else
        false
      end
    end

    #
    # Returns +true+ if *link* should not be visited because
    # it has a query string and +skip_query_strings+ is true.
    #
    def skip_query_string?(link)
      @opts[:skip_query_strings] && link.query
    end

    #
    # Returns +true+ if *link* should not be visited because
    # its URL matches a skip_link pattern.
    #
    def skip_link?(link)
      @skip_link_patterns.any? { |pattern| link.path =~ pattern }
    end

    #
    # Kills all active threads
    #
    def shutdown
      @tentacles.each {|t| t.kill rescue nil }
      @pages = nil
    end

  end
end