require 'net/https'
require 'anemone/page'
require 'anemone/cookie_store'
module Anemone
class HTTP
# Default cap on the number of requests #get makes for one URL (the first
# request counts); used by #redirect_limit when opts[:redirect_limit] is
# not set.
REDIRECT_LIMIT = 5
# The CookieStore built in #initialize from opts[:cookies]. #get_response
# sends it in the Cookie header and merges Set-Cookie response headers into
# it when #accept_cookies? is truthy.
attr_reader :cookie_store
# Creates an HTTP client.
#
# +opts+ is kept as-is and consulted lazily by the other methods of this
# class. Keys read in this file: :cookies, :redirect_limit, :user_agent,
# :accept_cookies, :http_basic_auth, :inject_headers, :request_factory,
# :http_factory, :retry_limit and :verbose.
def initialize(opts = {})
  @opts = opts
  # Open connections, keyed by host and then by port.
  @connections = {}
  @cookie_store = CookieStore.new(opts[:cookies])
end
# Fetches +url+ and returns only the last Page produced by #fetch_pages,
# i.e. the page at the end of any redirect chain (or the error page).
# +referer+ and +depth+ are passed through to #fetch_pages unchanged.
def fetch_page(url, referer = nil, depth = nil)
  pages = fetch_pages(url, referer, depth)
  pages.last
end
# Fetches +url+ and returns an Array with one Page per response yielded by
# #get (so one entry per hop of a redirect chain).
#
# +url+ may be a URI or anything URI() accepts. +referer+ and +depth+ are
# recorded on every Page; +referer+ is also handed to #get.
#
# Any StandardError raised along the way is turned into a single-element
# Array holding a Page built with :error => the exception; pages collected
# before the failure are discarded. When #verbose? is truthy the exception
# and its backtrace are printed first.
def fetch_pages(url, referer = nil, depth = nil)
  url = URI(url) unless url.is_a?(URI)
  collected = []
  get(url, referer) do |response, code, location, redirect_to, response_time|
    page = Page.new(location, :body => response.body.dup,
                              :code => code,
                              :headers => response.to_hash,
                              :referer => referer,
                              :depth => depth,
                              :redirect_to => redirect_to,
                              :response_time => response_time)
    collected << page
  end
  collected
rescue => e
  if verbose?
    puts e.inspect
    puts e.backtrace
  end
  # +url+ is still the caller's original value if URI() itself failed.
  [Page.new(url, :error => e)]
end
# Returns the cap on requests made while following redirects: the
# :redirect_limit option when it is truthy, REDIRECT_LIMIT otherwise.
def redirect_limit
  configured = @opts[:redirect_limit]
  return REDIRECT_LIMIT unless configured
  configured
end
# Returns the value of the :user_agent option passed to #initialize.
# #get_response sends it as the User-Agent header when it is truthy.
def user_agent
@opts[:user_agent]
end
# Returns the value of the :accept_cookies option (truthy means enabled).
# When enabled, #get_response merges each response's Set-Cookie header into
# the cookie store and sends the store's contents in the Cookie header.
def accept_cookies?
@opts[:accept_cookies]
end
private
# Issues a GET for +url+ and follows redirects, yielding once per response:
#
#   response, code, location, redirect_to, response_time
#
# where +code+ is the Integer status, +location+ the URI that was requested,
# +redirect_to+ the normalized redirect target (URI, possibly relative) or
# nil, and +response_time+ the value reported by #get_response.
#
# At least one request is always made and at most #redirect_limit in total.
# A redirect is followed only when #allowed? accepts its target relative to
# the original +url+. +referer+ is passed to every request.
def get(url, referer = nil)
  remaining = redirect_limit
  base = url
  loc = url
  loop do
    # A relative Location is resolved against the URL that issued it, not
    # against the first URL of the chain.
    loc = base.merge(loc) if loc.relative?

    response, response_time = get_response(loc, referer)
    code = Integer(response.code)
    redirect_to = redirect_target(response)
    yield response, code, loc, redirect_to, response_time

    remaining -= 1
    break unless redirect_to && allowed?(redirect_to, url) && remaining > 0

    base = loc
    loc = redirect_to
  end
end

# Returns the normalized Location of a redirect response as a URI, or nil
# when +response+ is not a redirect or carries no usable Location header
# (e.g. 304 Not Modified, which is a Net::HTTPRedirection without one).
def redirect_target(response)
  return nil unless response.is_a?(Net::HTTPRedirection)

  location = response['location']
  return nil if location.nil? || location.empty?

  URI(location).normalize
end
# Performs one GET for +url+ and returns [response, response_time], where
# +response_time+ is the elapsed wall-clock time in whole milliseconds.
#
# The request goes through opts[:request_factory] when given (called with
# the connection, the request path and the header Hash), otherwise through
# the cached connection's #get. When #accept_cookies? is truthy the
# response's Set-Cookie header is merged into the cookie store.
#
# An EOFError refreshes the connection and retries, up to
# opts[:retry_limit] (default 3) retries. Once they are used up the
# EOFError is re-raised instead of returning nil.
def get_response(url, referer = nil)
  # Net::HTTP rejects an empty request path, which is what a URL such as
  # "http://example.com" (no trailing slash) has.
  path = url.path.to_s.empty? ? '/' : url.path
  full_path = url.query.nil? ? path : "#{path}?#{url.query}"
  headers = request_headers(referer)
  retry_limit = @opts[:retry_limit] || 3

  retries = 0
  begin
    start = Time.now
    response = if @opts[:request_factory]
                 @opts[:request_factory].call(connection(url), full_path, headers)
               else
                 connection(url).get(full_path, headers)
               end
    finish = Time.now
    response_time = ((finish - start) * 1000).round
    @cookie_store.merge!(response['Set-Cookie']) if accept_cookies?
    return response, response_time
  rescue EOFError => e
    refresh_connection(url)
    retries += 1
    raise e if retries > retry_limit
    retry
  end
end

# Builds the header Hash for one request: User-Agent, Referer, Cookie,
# Basic Authorization and any opts[:inject_headers] entries ("Name:value"
# strings split at the first colon), in that order of precedence.
def request_headers(referer)
  headers = {}
  headers['User-Agent'] = user_agent if user_agent
  headers['Referer'] = referer.to_s if referer
  # Cookies are sent when the store has any and either cookies are being
  # accepted or an initial :cookies option was supplied.
  if !@cookie_store.empty? && (accept_cookies? || !@opts[:cookies].nil?)
    headers['Cookie'] = @cookie_store.to_s
  end
  if @opts[:http_basic_auth]
    headers['Authorization'] = "Basic " + @opts[:http_basic_auth]
  end
  unless @opts[:inject_headers].nil?
    @opts[:inject_headers].each do |hdr|
      name, value = hdr.split(':', 2)
      headers[name] = value
    end
  end
  headers
end
# Returns the cached connection for +url+'s host and port, creating and
# caching one through #refresh_connection when none is stored yet.
def connection(url)
  per_host = (@connections[url.host] ||= {})
  cached = per_host[url.port]
  cached || refresh_connection(url)
end
# Opens a new connection for +url+'s host and port, stores it in the
# connection cache (replacing any previous entry) and returns it.
#
# The connection comes from opts[:http_factory] when given (called with
# +url+), otherwise from Net::HTTP.new with SSL enabled for https URLs; in
# both cases #start is called on it and the result is what gets cached.
# A previously cached connection for the same host/port is closed first so
# that replacing it after an EOFError does not leak the old session.
def refresh_connection(url)
  per_host = (@connections[url.host] ||= {})
  close_connection(per_host.delete(url.port))

  http = nil
  if @opts[:http_factory]
    http = @opts[:http_factory].call(url)
  else
    http = Net::HTTP.new(url.host, url.port)
    if url.scheme == 'https'
      http.use_ssl = true
      # NOTE(review): certificate verification is disabled, so https peers
      # are not authenticated. Left as-is because changing it would alter
      # which sites can be fetched; confirm this is intended.
      http.verify_mode = OpenSSL::SSL::VERIFY_NONE
    end
  end
  per_host[url.port] = http.start
end

# Best-effort close of a connection that is being replaced. Failures are
# ignored on purpose: the connection is usually already broken, and a
# failed close must not prevent opening its replacement.
def close_connection(conn)
  return if conn.nil?
  return unless conn.respond_to?(:started?) && conn.respond_to?(:finish)

  conn.finish if conn.started?
rescue StandardError
  nil
end
# Returns the value of the :verbose option (truthy means enabled). When
# enabled, #fetch_pages prints rescued exceptions and their backtraces.
def verbose?
@opts[:verbose]
end
# Returns true when a redirect from +from_url+ to +to_url+ may be followed:
# either +to_url+ has no host (a relative redirect) or both URLs name the
# same host. Host names are compared case-insensitively, because redirect
# targets arrive normalized (lower-cased) while +from_url+ may not be.
def allowed?(to_url, from_url)
  to_host = to_url.host
  return true if to_host.nil?

  from_host = from_url.host
  !from_host.nil? && to_host.downcase == from_host.downcase
end
end
end