# CoCalc -- page.rb
#
# GitHub Repository: rapid7/metasploit-framework
# Path: blob/master/lib/anemone/page.rb
# 19566 views
1
require 'nokogiri'
2
require 'ostruct'
3
require 'webrick/cookie'
4

5
module Anemone
6

7
  # Path extractor container namespace.
  module Extractors
    #
    # Common base for extractors: keeps hold of the page being processed
    # and forwards +doc+ to it.
    #
    class Base
      # The page object this extractor was created for
      attr_reader :page

      #
      # Store the *page* the extractor will work on
      #
      def initialize( page )
        @page = page
      end

      #
      # Parsed document of the wrapped page (delegates to page.doc)
      #
      def doc
        page.doc
      end
    end
  end
21

22
  class Page
23

24
    # The URL of the page
    attr_reader :url
    # The raw HTTP response body of the page
    attr_reader :body
    # Headers of the HTTP response
    attr_reader :headers
    # URL of the page this one redirected to, if any
    attr_reader :redirect_to
    # Exception object, if one was raised during HTTP#fetch_page
    attr_reader :error

    # OpenStruct for user-stored data
    attr_accessor :data
    # Integer response code of the page
    attr_accessor :code
    # Boolean indicating whether or not this page has been visited in PageStore#shortest_paths!
    attr_accessor :visited
    # Depth of this page from the root of the crawl. This is not necessarily the
    # shortest path; use PageStore#shortest_paths! to find that value.
    attr_accessor :depth
    # URL of the page that brought us to this page
    attr_accessor :referer
    # Response time of the request for this page in milliseconds
    attr_accessor :response_time
    # Storage for the original HTTP request that generated this response
    attr_accessor :request
50

51
    #
    # Create a new page
    #
    # *url* is stored as-is; to_absolute and in_domain? later call +merge+
    # and +host+ on it.
    #
    # Recognized *params* keys: :dirbust, :code, :headers, :aka, :referer,
    # :depth, :redirect_to, :response_time, :body and :error.
    #
    def initialize(url, params = {})
      @url = url
      @data = OpenStruct.new

      @dirbust = params[:dirbust]
      @code = params[:code]
      @headers = params[:headers] || {}
      # make sure a content-type entry always exists
      @headers['content-type'] ||= ['']
      # :aka may be nil, a single value or a list; nil entries are dropped
      @aliases = Array(params[:aka]).compact
      @referer = params[:referer]
      @depth = params[:depth] || 0
      @redirect_to = to_absolute(params[:redirect_to])
      @response_time = params[:response_time]
      @body = params[:body]
      @error = params[:error]

      # a page counts as fetched as soon as a response code was supplied
      @fetched = !params[:code].nil?
    end
72

73
    #
    # Extractor classes available to pages (memoized on the class).
    #
    # On first use, requires every .rb file in the extractors/ directory
    # next to this file, then collects every constant defined under
    # Extractors except Base.
    #
    def self.extractors
      return @extractors if @extractors

      lib = File.dirname( __FILE__ ) + '/extractors/*.rb'
      Dir.glob( lib ).each { |e| require e }

      @extractors = Extractors.constants.map do |e|
        next if e == :Base
        Extractors.const_get( e )
      end.compact
    end
84

85
    #
    # Run the extractors against this page and return the distinct
    # in-domain links they produce, as absolute URIs.
    #
    # Returns an empty Array when there is no parsed document. The
    # Dirbuster extractor is only run when dirbust? is set. An extractor
    # that raises, or a path that cannot be converted to an absolute URI,
    # is skipped.
    #
    def run_extractors
      return [] if !doc

      self.class.extractors.map do |e|
        next if e == Extractors::Dirbuster && !dirbust?
        e.new( self ).run rescue next
      end.flatten.compact.map do |p|
        abs = to_absolute( URI( p ) ) rescue next
        !in_domain?( abs ) ? nil : abs
      end.compact.uniq
    end
96

97
    #
    # Array of distinct links found on the page
    #
    # MODIFIED: URLs are dug out of elements other than "A" HREFs as well;
    # the result of run_extractors is computed once and then reused.
    #
    def links
      @links ||= run_extractors
    end
105

106
    #
    # Nokogiri document for the HTML body
    #
    # Returns the cached document when there is one. Otherwise the body is
    # parsed only if it is present and html? holds; +nil+ is returned when
    # there is nothing to parse or when parsing raises.
    #
    def doc
      return @doc if @doc
      @doc = Nokogiri::HTML(@body) if @body && html? rescue nil
    end
113

114
    #
    # Delete the Nokogiri document and response body to conserve memory
    #
    # Always returns +nil+.
    #
    def discard_doc!
      links # force parsing of page links before we trash the document
      @doc = @body = nil
    end
121

122
    #
    # Was the page fetched?
    # Returns the stored flag; the constructor sets it to +true+ when a
    # response code was supplied and to +false+ otherwise.
    #
    def fetched?
      @fetched
    end
129

130
    #
    # Array of cookies received with this page as WEBrick::Cookie objects.
    #
    # Returns an empty Array when the Set-Cookie header cannot be parsed
    # (any error raised while parsing is swallowed).
    #
    def cookies
      WEBrick::Cookie.parse_set_cookies(@headers['Set-Cookie']) rescue []
    end
136

137
    #
    # The content-type returned by the HTTP request for this page
    #
    # When the header value is an Array, its first element is returned.
    #
    def content_type
      res = headers['content-type']
      res = res.first if res.kind_of?(::Array)
      res
    end
145

146
    #
    # Returns +true+ if the page is a HTML document, returns +false+
    # otherwise.
    #
    # Accepts text/html and application/xhtml+xml, with or without
    # parameters such as a charset. Media types are compared
    # case-insensitively.
    #
    def html?
      # the "+" must be escaped, otherwise it acts as a quantifier and the
      # literal "application/xhtml+xml" never matches
      !!(content_type =~ %r{^(text/html|application/xhtml\+xml)\b}i)
    end
153

154
    #
    # Returns +true+ if the page is a HTTP redirect, returns +false+
    # otherwise.
    #
    # Covers status codes 300 through 308, so 308 Permanent Redirect is
    # included.
    #
    def redirect?
      (300..308).include?(@code)
    end
161

162
    #
    # Returns +true+ if the page was not found (returned 404 code),
    # returns +false+ otherwise.
    #
    def not_found?
      404 == @code
    end
169

170
    #
    # Converts relative URL *link* into an absolute URL based on the
    # location of the page
    #
    # Returns +nil+ when *link* is +nil+. A trailing "#anchor" made of
    # letters, digits, "_" or "-" is removed, the remainder is escaped,
    # and an empty path on the result is replaced with "/".
    #
    def to_absolute(link)
      return nil if link.nil?

      # remove anchor
      link = URI::DEFAULT_PARSER.escape(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))

      relative = URI(link)
      absolute = @url.merge(relative)

      absolute.path = '/' if absolute.path.empty?

      return absolute
    end
187

188
    #
    # Value of the :dirbust option given to the constructor; run_extractors
    # only runs the Dirbuster extractor when this is truthy.
    #
    def dirbust?
      @dirbust
    end
191

192
    #
    # Returns +true+ if *uri* is in the same domain as the page, returns
    # +false+ otherwise
    #
    # Only the hosts are compared; they must be exactly equal.
    #
    def in_domain?(uri)
      uri.host == @url.host
    end
199

200
    #
    # State serialized by Marshal.dump, as a positional Array. The order
    # must match marshal_load. @dirbust, @aliases, @error and @request are
    # not part of the dump.
    #
    def marshal_dump
      [@url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched]
    end
203

204
    #
    # Restore the state produced by marshal_dump from the positional
    # Array *ary* (same element order).
    #
    def marshal_load(ary)
      @url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched = ary
    end
207

208
    #
    # String-keyed Hash representation of the page.
    #
    # The url, referer, redirect_to and every link are converted with
    # +to_s+ (so +nil+ becomes ""); headers and data are stored as
    # Marshal dumps. Calls links, so link extraction runs here if it has
    # not run yet.
    #
    def to_hash
      {'url' => @url.to_s,
       'headers' => Marshal.dump(@headers),
       'data' => Marshal.dump(@data),
       'body' => @body,
       'links' => links.map(&:to_s),
       'code' => @code,
       'visited' => @visited,
       'depth' => @depth,
       'referer' => @referer.to_s,
       'redirect_to' => @redirect_to.to_s,
       'response_time' => @response_time,
       'fetched' => @fetched}
    end
222

223
    #
    # Rebuild a page from the String-keyed *hash* produced by to_hash.
    #
    # code, depth and response_time are converted with +to_i+; links are
    # turned back into URIs. A missing or empty 'redirect_to' is restored
    # as +nil+ (to_hash writes "" for a page without a redirect), and a
    # missing 'links' entry is treated as an empty list.
    #
    # NOTE(review): headers and data are read back with Marshal.load,
    # which must only be used on data from a trusted store.
    #
    def self.from_hash(hash)
      page = self.new(URI(hash['url']))

      redirect_to = hash['redirect_to'].to_s

      {'@headers' => Marshal.load(hash['headers']),
       '@data' => Marshal.load(hash['data']),
       '@body' => hash['body'],
       '@links' => Array(hash['links']).map { |link| URI(link) },
       '@code' => hash['code'].to_i,
       '@visited' => hash['visited'],
       '@depth' => hash['depth'].to_i,
       '@referer' => hash['referer'],
       '@redirect_to' => (redirect_to.empty? ? nil : URI(redirect_to)),
       '@response_time' => hash['response_time'].to_i,
       '@fetched' => hash['fetched']
      }.each do |var, value|
        page.instance_variable_set(var, value)
      end
      page
    end
241

242
    #
    # Deep copy of the page, made by a Marshal round trip (so only the
    # state listed in marshal_dump is carried over).
    #
    def dup
      Marshal.load( Marshal.dump( self ) )
    end
245

246
  end
247
end
248

249
Product

Resources

Company