Path: blob/master/lib/anemone/core.rb
require 'thread'
require 'robots'
require 'anemone/tentacle'
require 'anemone/page'
require 'anemone/exceptions'
require 'anemone/page_store'
require 'anemone/storage'
require 'anemone/storage/base'

module Anemone

  VERSION = '0.5.0'

  #
  # Convenience method to start a crawl
  #
  def Anemone.crawl(urls, options = {}, &block)
    Core.crawl(urls, options, &block)
  end

  class Core

    # PageStore storing all Page objects encountered during the crawl
    attr_reader :pages
    # Hash of options for the crawl
    attr_reader :opts

    DEFAULT_OPTS = {
      # run 4 Tentacle threads to fetch pages
      :threads => 4,
      # disable verbose output
      :verbose => false,
      # don't throw away the page response body after scanning it for links
      :discard_page_bodies => false,
      # identify self as Anemone/VERSION
      :user_agent => "Anemone/#{Anemone::VERSION}",
      # no delay between requests
      :delay => 0,
      # don't obey the robots exclusion protocol
      :obey_robots_txt => false,
      # by default, don't limit the depth of the crawl
      :depth_limit => false,
      # number of times HTTP redirects will be followed
      :redirect_limit => 5,
      # storage engine defaults to Hash in +process_options+ if none specified
      :storage => nil,
      # hash of cookie name => value to send with HTTP requests
      :cookies => nil,
      # basic authentication data to send with HTTP requests
      :http_basic_auth => nil,
      # array of raw header lines to inject into each request
      :inject_headers => [],
      # accept cookies from the server and send them back?
      :accept_cookies => false,
      # skip any link with a query string? e.g. http://foo.com/?u=user
      :skip_query_strings => false,
      :dirbust => true
    }

    # Create setter methods for all options to be called from the crawl block
    DEFAULT_OPTS.keys.each do |key|
      define_method "#{key}=" do |value|
        @opts[key.to_sym] = value
      end
    end

    #
    # Initialize the crawl with starting *urls* (single URL or Array of URLs)
    # and optional *block*
    #
    def initialize(urls, opts = {})
      @urls = [urls].flatten.map { |url| url.is_a?(URI) ? url : URI(url) }
      @urls.each { |url| url.path = '/' if url.path.empty? }

      @tentacles = []
      @on_every_page_blocks = []
      @on_pages_like_blocks = Hash.new { |hash, key| hash[key] = [] }
      @skip_link_patterns = []
      @after_crawl_blocks = []
      @opts = opts

      yield self if block_given?
    end

    #
    # Convenience method to start a new crawl
    #
    def self.crawl(urls, opts = {})
      self.new(urls, opts) do |core|
        yield core if block_given?
        core.run
      end
    end

    #
    # Add a block to be executed on the PageStore after the crawl
    # is finished
    #
    def after_crawl(&block)
      @after_crawl_blocks << block
      self
    end

    #
    # Add one or more Regex patterns for URLs which should not be
    # followed
    #
    def skip_links_like(*patterns)
      @skip_link_patterns.concat [patterns].flatten.compact
      self
    end

    #
    # Add a block to be executed on every Page as they are encountered
    # during the crawl
    #
    def on_every_page(&block)
      @on_every_page_blocks << block
      self
    end

    #
    # Add a block to be executed on Page objects with a URL matching
    # one or more patterns
    #
    def on_pages_like(*patterns, &block)
      if patterns
        patterns.each do |pattern|
          @on_pages_like_blocks[pattern] << block
        end
      end
      self
    end

    #
    # Specify a block which will select which links to follow on each page.
    # The block should return an Array of URI objects.
    #
    def focus_crawl(&block)
      @focus_crawl_block = block
      self
    end

    #
    # Perform the crawl
    #
    def run
      process_options

      @urls.delete_if { |url| !visit_link?(url) }
      return if @urls.empty?

      link_queue = Queue.new
      page_queue = Queue.new

      @opts[:threads].times do
        @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
      end

      @urls.each { |url| link_queue.enq(url) }

      loop do
        page = page_queue.deq
        @pages.touch_key page.url
        puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
        do_page_blocks page
        page.discard_doc! if @opts[:discard_page_bodies]

        links = links_to_follow page
        links.each do |link|
          link_queue << [link, page.url.dup, page.depth + 1]
        end
        @pages.touch_keys links

        @pages[page.url] = page

        # if we are done with the crawl, tell the threads to end
        if link_queue.empty? and page_queue.empty?
          until link_queue.num_waiting == @tentacles.size
            Thread.pass
          end
          if page_queue.empty?
            @tentacles.size.times { link_queue << :END }
            break
          end
        end
      end

      @tentacles.each { |thread| thread.join }
      do_after_crawl_blocks
      self
    end

    private

    def process_options
      @opts = DEFAULT_OPTS.merge @opts
      @opts[:threads] = 1 if @opts[:delay] > 0
      storage = Anemone::Storage::Base.new(@opts[:storage] || Anemone::Storage.Hash)
      @pages = PageStore.new(storage)
      @robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]

      # freeze_options
    end

    #
    # Freeze the opts Hash so that no options can be modified
    # once the crawl begins
    #
    def freeze_options
      @opts.freeze
      @opts.each_key { |key| @opts[key].freeze }
      @opts[:cookies].each_key { |key| @opts[:cookies][key].freeze } rescue nil
    end

    #
    # Execute the after_crawl blocks
    #
    def do_after_crawl_blocks
      @after_crawl_blocks.each { |block| block.call(@pages) }
    end

    #
    # Execute the on_every_page blocks for *page*
    #
    def do_page_blocks(page)
      @on_every_page_blocks.each do |block|
        block.call(page)
      end

      @on_pages_like_blocks.each do |pattern, blocks|
        blocks.each { |block| block.call(page) } if page.url.to_s =~ pattern
      end
    end

    #
    # Return an Array of links to follow from the given page,
    # based on whether or not the link has already been crawled,
    # and the block given to focus_crawl()
    #
    def links_to_follow(page)
      links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
      links.select { |link| visit_link?(link, page) }.map { |link| link.dup }
    end

    #
    # Returns +true+ if *link* has not been visited already,
    # is not excluded by a skip_link pattern,
    # is not excluded by robots.txt,
    # and is not deeper than the depth limit.
    # Returns +false+ otherwise.
    #
    def visit_link?(link, from_page = nil)
      !@pages.has_page?(link) &&
        !skip_link?(link) &&
        !skip_query_string?(link) &&
        allowed(link) &&
        !too_deep?(from_page)
    end

    #
    # Returns +true+ if we are obeying robots.txt and the link
    # is granted access in it. Always returns +true+ when we are
    # not obeying robots.txt.
    #
    def allowed(link)
      @opts[:obey_robots_txt] ? @robots.allowed?(link) : true
    end

    #
    # Returns +true+ if we are over the page depth limit.
    # This only works when coming from a page and with the +depth_limit+ option set.
    # When neither is the case, will always return +false+.
    #
    def too_deep?(from_page)
      if from_page && @opts[:depth_limit]
        from_page.depth >= @opts[:depth_limit]
      else
        false
      end
    end

    #
    # Returns +true+ if *link* should not be visited because
    # it has a query string and +skip_query_strings+ is true.
    #
    def skip_query_string?(link)
      @opts[:skip_query_strings] && link.query
    end

    #
    # Returns +true+ if *link* should not be visited because
    # its URL matches a skip_link pattern.
    #
    def skip_link?(link)
      @skip_link_patterns.any? { |pattern| link.path =~ pattern }
    end

    #
    # Kills all active threads
    #
    def shutdown
      @tentacles.each { |t| t.kill rescue nil }
      @pages = nil
    end

  end
end
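
The snippet below is a minimal usage sketch, not part of core.rb: the option keys, the generated setter, and the block methods all come from the class above, while the start URL, the skip pattern, and the printed output are hypothetical placeholders. It also assumes the gem's usual 'anemone' entry point is available to require.

require 'anemone'

Anemone.crawl("http://example.com/", :threads => 4, :depth_limit => 2) do |anemone|
  # option setters generated from DEFAULT_OPTS can also be called inside the block
  anemone.verbose = true

  # regex patterns for links that should never be followed (hypothetical pattern)
  anemone.skip_links_like(/\/logout/)

  # runs for every Page as it comes off the page queue
  anemone.on_every_page do |page|
    puts page.url
  end

  # runs with the PageStore of all crawled pages once the crawl has finished
  anemone.after_crawl do |pages|
    puts "crawl finished"
  end
end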