CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Sign Up | Sign In
rapid7

CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!

GitHub Repository: rapid7/metasploit-framework
Path: blob/master/lib/anemone/page.rb
Views: 1904
1
require 'nokogiri'
2
require 'ostruct'
3
require 'webrick/cookie'
4
5
module Anemone
6
7
# Path extractor container namespace.
module Extractors
  # Superclass for the individual extractors. It only stores the page it
  # was given and exposes that page's parsed document; Page.extractors
  # explicitly skips this class when collecting extractors to run.
  class Base
    # The page object handed to the constructor.
    attr_reader :page

    # page - the object to extract from; it must respond to #doc.
    def initialize( page )
      @page = page
    end

    # Returns whatever the wrapped page's #doc returns.
    def doc
      page.doc
    end
  end
end
21
22
#
# A single crawled page: its URL, the HTTP response data that came back for
# it (code, headers, body, redirect target, error) and crawl bookkeeping
# (depth, referer, visited flag, response time).
#
class Page

  # The URL of the page
  attr_reader :url
  # The raw HTTP response body of the page
  attr_reader :body
  # Headers of the HTTP response
  attr_reader :headers
  # URL of the page this one redirected to, if any
  attr_reader :redirect_to
  # Exception object, if one was raised during HTTP#fetch_page
  attr_reader :error

  # OpenStruct for user-stored data
  attr_accessor :data
  # Integer response code of the page
  attr_accessor :code
  # Boolean indicating whether or not this page has been visited in PageStore#shortest_paths!
  attr_accessor :visited
  # Depth of this page from the root of the crawl. This is not necessarily the
  # shortest path; use PageStore#shortest_paths! to find that value.
  attr_accessor :depth
  # URL of the page that brought us to this page
  attr_accessor :referer
  # Response time of the request for this page in milliseconds
  attr_accessor :response_time
  # Storage for the original HTTP request that generated this response
  attr_accessor :request

  #
  # Create a new page
  #
  # url    - the page's URL; it must respond to #merge and #host because
  #          to_absolute and in_domain? call those on it.
  # params - optional Hash; the keys read are :dirbust, :code, :headers,
  #          :aka, :referer, :depth, :redirect_to, :response_time, :body
  #          and :error.
  #
  def initialize(url, params = {})
    @url = url
    @data = OpenStruct.new

    @dirbust = params[:dirbust]
    @code = params[:code]
    @headers = params[:headers] || {}
    # Guarantee a content-type entry so content_type/html? never see nil.
    # Note that this writes into the hash the caller passed in.
    @headers['content-type'] ||= ['']
    @aliases = Array(params[:aka]).compact
    @referer = params[:referer]
    @depth = params[:depth] || 0
    @redirect_to = to_absolute(params[:redirect_to])
    @response_time = params[:response_time]
    @body = params[:body]
    @error = params[:error]

    # A page counts as fetched as soon as a response code was supplied.
    @fetched = !params[:code].nil?
  end

  #
  # Array of extractor classes: every constant in Extractors except Base.
  # On first use all files in the "extractors" directory next to this file
  # are required; the resulting list is cached on the class.
  #
  def self.extractors
    return @extractors if @extractors

    lib = File.dirname( __FILE__ ) + '/extractors/*.rb'
    Dir.glob( lib ).each { |e| require e }

    @extractors = Extractors.constants.map do |e|
      next if e == :Base
      Extractors.const_get( e )
    end.compact
  end

  #
  # Runs every extractor against this page and returns the distinct
  # absolute URLs that are on the same host as the page. Returns an empty
  # Array when there is no parsed document.
  #
  def run_extractors
    return [] if !doc

    found = self.class.extractors.map do |e|
      # The Dirbuster extractor only runs when dirbusting was requested.
      next if e == Extractors::Dirbuster && !dirbust?
      # Best effort: an extractor that raises contributes nothing.
      e.new( self ).run rescue next
    end

    found.flatten.compact.map do |p|
      # Paths that cannot be parsed or resolved are dropped.
      abs = to_absolute( URI( p ) ) rescue next
      !in_domain?( abs ) ? nil : abs
    end.compact.uniq
  end

  #
  # Array of distinct A tag HREFs from the page
  #
  # MODIFIED: Dig URLs from elements other than "A" refs
  #
  # The result of run_extractors is memoized in @links.
  #
  def links
    @links ||= run_extractors
  end

  #
  # Nokogiri document for the HTML body
  #
  # Parsed lazily and cached. Returns nil when there is no body, when the
  # page is not HTML, or when anything raises while parsing.
  #
  def doc
    return @doc if @doc
    @doc = Nokogiri::HTML(@body) if @body && html? rescue nil
  end

  #
  # Delete the Nokogiri document and response body to conserve memory
  #
  def discard_doc!
    links # force parsing of page links before we trash the document
    @doc = @body = nil
  end

  #
  # Was the page successfully fetched?
  # +true+ if the page was fetched with no error, +false+ otherwise.
  #
  def fetched?
    @fetched
  end

  #
  # Array of cookies received with this page as WEBrick::Cookie objects.
  # Returns an empty Array if parsing the 'Set-Cookie' header raises.
  #
  def cookies
    WEBrick::Cookie.parse_set_cookies(@headers['Set-Cookie']) rescue []
  end

  #
  # The content-type returned by the HTTP request for this page
  #
  # When the header value is an Array its first element is returned.
  #
  def content_type
    res = headers['content-type']
    res = res.first if res.kind_of?(::Array)
    res
  end

  #
  # Returns +true+ if the page is a HTML document, returns +false+
  # otherwise.
  #
  def html?
    # The "+" of "xhtml+xml" must be escaped: unescaped it is a regexp
    # quantifier, so the real media type "application/xhtml+xml" would
    # never match.
    !!(content_type =~ %r{^(text/html|application/xhtml\+xml)\b})
  end

  #
  # Returns +true+ if the page is a HTTP redirect, returns +false+
  # otherwise.
  #
  def redirect?
    (300..307).include?(@code)
  end

  #
  # Returns +true+ if the page was not found (returned 404 code),
  # returns +false+ otherwise.
  #
  def not_found?
    404 == @code
  end

  #
  # Converts relative URL *link* into an absolute URL based on the
  # location of the page
  #
  # Returns nil when *link* is nil. A trailing "#anchor" made of letters,
  # digits, "_" or "-" is removed, and an empty path is replaced by "/".
  #
  def to_absolute(link)
    return nil if link.nil?

    # remove anchor
    link = URI::DEFAULT_PARSER.escape(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))

    relative = URI(link)
    absolute = @url.merge(relative)

    absolute.path = '/' if absolute.path.empty?

    return absolute
  end

  #
  # The value of the :dirbust option given at construction time; it
  # controls whether run_extractors uses the Dirbuster extractor.
  #
  def dirbust?
    @dirbust
  end

  #
  # Returns +true+ if *uri* is in the same domain as the page, returns
  # +false+ otherwise
  #
  # Only the host parts are compared.
  #
  def in_domain?(uri)
    uri.host == @url.host
  end

  #
  # State written by Marshal.dump. @dirbust, @aliases, @error, @request and
  # the cached @doc are not part of it, so they do not survive a Marshal
  # round trip (and therefore not #dup either).
  #
  def marshal_dump
    [@url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched]
  end

  #
  # Restores the state produced by marshal_dump; the order must match.
  #
  def marshal_load(ary)
    @url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched = ary
  end

  #
  # Hash representation of the page with String keys. Headers and data are
  # stored as Marshal dumps; URLs and links are stored as Strings. Calling
  # this forces link extraction through #links.
  #
  def to_hash
    {'url' => @url.to_s,
     'headers' => Marshal.dump(@headers),
     'data' => Marshal.dump(@data),
     'body' => @body,
     'links' => links.map(&:to_s),
     'code' => @code,
     'visited' => @visited,
     'depth' => @depth,
     'referer' => @referer.to_s,
     'redirect_to' => @redirect_to.to_s,
     'response_time' => @response_time,
     'fetched' => @fetched}
  end

  #
  # Rebuilds a page from a Hash in the format produced by to_hash.
  #
  # NOTE(review): 'headers' and 'data' are passed to Marshal.load, which is
  # only safe when the hash comes from a trusted store.
  #
  def self.from_hash(hash)
    page = self.new(URI(hash['url']))
    {'@headers' => Marshal.load(hash['headers']),
     '@data' => Marshal.load(hash['data']),
     '@body' => hash['body'],
     '@links' => hash['links'].map { |link| URI(link) },
     '@code' => hash['code'].to_i,
     '@visited' => hash['visited'],
     '@depth' => hash['depth'].to_i,
     '@referer' => hash['referer'],
     '@redirect_to' => URI(hash['redirect_to']),
     '@response_time' => hash['response_time'].to_i,
     '@fetched' => hash['fetched']
    }.each do |var, value|
      page.instance_variable_set(var, value)
    end
    page
  end

  #
  # Deep copy made through a Marshal round trip, so it contains exactly the
  # state listed in marshal_dump.
  #
  def dup
    Marshal.load( Marshal.dump( self ) )
  end

end
247
end
248
249