require 'nokogiri'
require 'ostruct'
require 'webrick/cookie'
module Anemone
module Extractors
class Base
attr_reader :page
def initialize( page )
@page = page
end
def doc
page.doc
end
end
end
class Page
attr_reader :url
attr_reader :body
attr_reader :headers
attr_reader :redirect_to
attr_reader :error
attr_accessor :data
attr_accessor :code
attr_accessor :visited
attr_accessor :depth
attr_accessor :referer
attr_accessor :response_time
attr_accessor :request
def initialize(url, params = {})
@url = url
@data = OpenStruct.new
@dirbust = params[:dirbust]
@code = params[:code]
@headers = params[:headers] || {}
@headers['content-type'] ||= ['']
@aliases = Array(params[:aka]).compact
@referer = params[:referer]
@depth = params[:depth] || 0
@redirect_to = to_absolute(params[:redirect_to])
@response_time = params[:response_time]
@body = params[:body]
@error = params[:error]
@fetched = !params[:code].nil?
end
def self.extractors
return @extractors if @extractors
lib = File.dirname( __FILE__ ) + '/extractors/*.rb'
Dir.glob( lib ).each { |e| require e }
@extractors = Extractors.constants.map do |e|
next if e == :Base
Extractors.const_get( e )
end.compact
end
def run_extractors
return [] if !doc
self.class.extractors.map do |e|
next if e == Extractors::Dirbuster && !dirbust?
e.new( self ).run rescue next
end.flatten.
compact.map do |p|
abs = to_absolute( URI( p ) ) rescue next
!in_domain?( abs ) ? nil : abs
end.compact.uniq
end
def links
@links ||= run_extractors
end
def doc
return @doc if @doc
@doc = Nokogiri::HTML(@body) if @body && html? rescue nil
end
def discard_doc!
links
@doc = @body = nil
end
def fetched?
@fetched
end
def cookies
WEBrick::Cookie.parse_set_cookies(@headers['Set-Cookie']) rescue []
end
def content_type
res = headers['content-type']
res = res.first if res.kind_of?(::Array)
res
end
def html?
!!(content_type =~ %r{^(text/html|application/xhtml+xml)\b})
end
def redirect?
(300..307).include?(@code)
end
def not_found?
404 == @code
end
def to_absolute(link)
return nil if link.nil?
link = URI::DEFAULT_PARSER.escape(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
relative = URI(link)
absolute = @url.merge(relative)
absolute.path = '/' if absolute.path.empty?
return absolute
end
def dirbust?
@dirbust
end
def in_domain?(uri)
uri.host == @url.host
end
def marshal_dump
[@url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched]
end
def marshal_load(ary)
@url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched = ary
end
def to_hash
{'url' => @url.to_s,
'headers' => Marshal.dump(@headers),
'data' => Marshal.dump(@data),
'body' => @body,
'links' => links.map(&:to_s),
'code' => @code,
'visited' => @visited,
'depth' => @depth,
'referer' => @referer.to_s,
'redirect_to' => @redirect_to.to_s,
'response_time' => @response_time,
'fetched' => @fetched}
end
def self.from_hash(hash)
page = self.new(URI(hash['url']))
{'@headers' => Marshal.load(hash['headers']),
'@data' => Marshal.load(hash['data']),
'@body' => hash['body'],
'@links' => hash['links'].map { |link| URI(link) },
'@code' => hash['code'].to_i,
'@visited' => hash['visited'],
'@depth' => hash['depth'].to_i,
'@referer' => hash['referer'],
'@redirect_to' => URI(hash['redirect_to']),
'@response_time' => hash['response_time'].to_i,
'@fetched' => hash['fetched']
}.each do |var, value|
page.instance_variable_set(var, value)
end
page
end
def dup
Marshal.load( Marshal.dump( self ) )
end
end
end