require 'thread'
require 'robots'
require 'anemone/tentacle'
require 'anemone/page'
require 'anemone/exceptions'
require 'anemone/page_store'
require 'anemone/storage'
require 'anemone/storage/base'
module Anemone
# Library version string; interpolated into the default :user_agent option
# ("Anemone/<VERSION>") in Core::DEFAULT_OPTS.
VERSION = '0.5.0'
#
# Module-level convenience entry point. Hands the URL(s), the options hash and
# the optional configuration block straight to Core.crawl and returns whatever
# that call returns.
#
def Anemone.crawl(urls, options = {}, &block)
  Anemone::Core.crawl(urls, options, &block)
end
class Core
# Read-only accessors:
#   pages - the PageStore built in process_options (nil until #run is called)
#   opts  - the Hash of crawl options
attr_reader :pages, :opts
# Default value for every supported crawl option. Caller-supplied options are
# merged over these in process_options, and the merged hash is handed to every
# Tentacle. Options that this file never reads are presumably consumed by
# Tentacle or the HTTP layer -- the notes on those entries are assumptions.
DEFAULT_OPTS = {
# number of Tentacle threads started by #run
:threads => 4,
# when true, #run prints each processed page URL and the link queue size
:verbose => false,
# when true, #run calls page.discard_doc! once the page callbacks have run
:discard_page_bodies => false,
# passed to Robots.new when :obey_robots_txt is on; presumably also sent as the HTTP User-Agent -- confirm
:user_agent => "Anemone/#{Anemone::VERSION}",
# any value greater than 0 forces :threads to 1; presumably the pause between requests -- confirm in Tentacle
:delay => 0,
# when true, a link is only followed if Robots#allowed? accepts it
:obey_robots_txt => false,
# false means no limit; otherwise links found on a page whose depth is >= this value are not followed
:depth_limit => false,
# not read in this file; presumably the maximum number of HTTP redirects to follow -- confirm
:redirect_limit => 5,
# storage adapter wrapped by Anemone::Storage::Base; nil falls back to Anemone::Storage.Hash
:storage => nil,
# only touched by freeze_options here, which treats it as a hash-like object; presumably cookie name => value pairs -- confirm
:cookies => nil,
# not read in this file; presumably HTTP basic-auth credentials -- confirm
:http_basic_auth => nil,
# not read in this file; presumably extra request headers -- confirm.
# NOTE(review): this Array instance is shared by every crawler that does not override the option.
:inject_headers => [],
# not read in this file; presumably whether cookies set by the server are sent back -- confirm
:accept_cookies => false,
# when true, any link that has a query string is not followed
:skip_query_strings => false,
# not read in this file; purpose cannot be determined from here -- TODO confirm
:dirbust => true
}
# Generate one writer per known option (threads=, verbose=, delay=, ...) so
# that options can be assigned directly on a crawler instance, for example
# inside the configuration block yielded by #initialize. Every key of
# DEFAULT_OPTS is already a Symbol, so it is used as the @opts key as-is.
DEFAULT_OPTS.each_key do |option|
  define_method("#{option}=") { |value| @opts[option] = value }
end
#
# Set up a crawler for one or more start URLs.
#
# urls - a single URL or an (optionally nested) Array of URLs; each entry may
#        be a String or a URI. A URL with an empty path gets the path '/'.
# opts - Hash of options; merged over DEFAULT_OPTS when the crawl is run.
#
# Yields the new instance when a block is given so it can be configured before
# the crawl starts.
#
# The caller's URI objects and options hash are copied rather than used
# directly, so neither the path normalisation nor the generated option writers
# (threads=, delay=, ...) modify objects owned by the caller.
#
def initialize(urls, opts = {})
  @urls = [urls].flatten.map { |url| url.is_a?(URI) ? url.dup : URI(url) }
  @urls.each { |url| url.path = '/' if url.path.empty? }

  @tentacles = []
  @on_every_page_blocks = []
  # every pattern gets its own callback list the first time it is looked up
  @on_pages_like_blocks = Hash.new { |hash, key| hash[key] = [] }
  @skip_link_patterns = []
  @after_crawl_blocks = []
  @opts = opts.dup

  yield self if block_given?
end
#
# Convenience constructor: builds a crawler for the given URL(s) and options,
# yields it to the caller's block (if any) for configuration, and then starts
# the crawl from inside the constructor's own block. Returns what +new+
# returns, i.e. the crawler instance.
#
def self.crawl(urls, opts = {})
  new(urls, opts) do |crawler|
    yield(crawler) if block_given?
    crawler.run
  end
end
#
# Register a block to be invoked once the crawl has finished. The block is
# called with the page store (see do_after_crawl_blocks).
# Returns self so that configuration calls can be chained.
#
def after_crawl(&block)
  @after_crawl_blocks.push(block)
  self
end
#
# Add one or more patterns describing links that must not be followed; each
# pattern is matched against a link's path (see skip_link?). Patterns may be
# given individually or in (nested) Arrays; nil entries are ignored.
# Returns self so that configuration calls can be chained.
#
def skip_links_like(*patterns)
  usable = patterns.flatten.compact
  @skip_link_patterns.concat(usable)
  self
end
#
# Register a block to be invoked with every page as it is processed during the
# crawl (see do_page_blocks).
# Returns self so that configuration calls can be chained.
#
def on_every_page(&block)
  @on_every_page_blocks.push(block)
  self
end
#
# Register a block to be invoked with every page whose URL matches one of the
# given patterns (see do_page_blocks).
#
# Patterns may be given individually or in (nested) Arrays, and nil entries
# are ignored -- the same argument handling as skip_links_like. Previously an
# Array or nil was stored as a pattern key itself, which can never match a URL.
#
# Returns self so that configuration calls can be chained.
#
def on_pages_like(*patterns, &block)
  patterns.flatten.compact.each do |pattern|
    @on_pages_like_blocks[pattern] << block
  end
  self
end
#
# Install a block that chooses which links to consider on each page. It is
# called with the page and its return value replaces page.links in
# links_to_follow, so it must return a collection of link objects.
# Returns self so that configuration calls can be chained.
#
def focus_crawl(&block)
  tap { @focus_crawl_block = block }
end
#
# Perform the crawl and return self once it has finished.
#
# Returns nil early -- without starting any thread and without running the
# after_crawl blocks -- when none of the start URLs passes visit_link?.
#
# NOTE(review): there is no rescue/ensure here, so if a page callback raises,
# the tentacle threads are left running and shutdown is never called.
#
def run
# merge the defaults into @opts and build @pages (and @robots when enabled)
process_options
# drop start URLs that visit_link? rejects
@urls.delete_if { |url| !visit_link?(url) }
return if @urls.empty?
# link_queue hands work to the tentacles; fetched pages are read back from
# page_queue (presumably pushed there by Tentacle#run -- confirm in anemone/tentacle)
link_queue = Queue.new
page_queue = Queue.new
@opts[:threads].times do
@tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
end
# seed the crawl: start URLs are queued bare, discovered links as [link, referer, depth]
@urls.each{ |url| link_queue.enq(url) }
loop do
# blocks until a page is available on the queue
page = page_queue.deq
@pages.touch_key page.url
puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
# the page callbacks run before the page body is (optionally) discarded
do_page_blocks page
page.discard_doc! if @opts[:discard_page_bodies]
links = links_to_follow page
links.each do |link|
# queue entry: the link, a copy of the URL it was found on, and the depth of the new page
link_queue << [link, page.url.dup, page.depth + 1]
end
# presumably records the queued links as known so has_page? rejects repeats -- confirm in PageStore
@pages.touch_keys links
@pages[page.url] = page
# termination check: nothing left to fetch and nothing left to process
if link_queue.empty? and page_queue.empty?
# wait until every tentacle thread is blocked on link_queue, i.e. no fetch is still in flight
until link_queue.num_waiting == @tentacles.size
Thread.pass
end
# a fetch that completed during the wait may have delivered another page; stop only if none did
if page_queue.empty?
# one :END per tentacle; presumably the sentinel that makes Tentacle#run return -- confirm
@tentacles.size.times { link_queue << :END }
break
end
end
end
@tentacles.each { |thread| thread.join }
do_after_crawl_blocks
self
end
private
#
# Finalise the options and build the collaborators that depend on them:
#   * merges the caller's options over DEFAULT_OPTS
#   * forces a single thread whenever a delay greater than 0 is configured
#   * wraps the configured storage adapter (Anemone::Storage.Hash when none
#     was given) in Storage::Base and builds the PageStore on top of it
#   * creates the Robots checker only when :obey_robots_txt is enabled
#
def process_options
  @opts = DEFAULT_OPTS.merge(@opts)
  @opts[:threads] = 1 if @opts[:delay] > 0

  adapter = @opts[:storage] || Anemone::Storage.Hash
  @pages = PageStore.new(Anemone::Storage::Base.new(adapter))

  @robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
end
#
# Freeze the options hash, every option value, and -- when the :cookies option
# is a hash-like object -- every cookie value as well.
#
# A missing (nil) :cookies option is handled with an explicit capability check
# instead of a blanket `rescue nil`, so genuine errors are no longer hidden.
#
# Returns the cookies object, or nil when there is none to freeze.
#
def freeze_options
  @opts.freeze
  @opts.each_value(&:freeze)

  cookies = @opts[:cookies]
  cookies.each_value(&:freeze) if cookies.respond_to?(:each_value)
end
#
# Invoke every after_crawl callback, in registration order, passing each one
# the page store.
#
def do_after_crawl_blocks
  @after_crawl_blocks.each do |callback|
    callback.call(@pages)
  end
end
#
# Run the page callbacks for +page+: first every on_every_page block, then the
# on_pages_like blocks of each pattern that matches the page URL (compared as
# a String).
#
def do_page_blocks(page)
  @on_every_page_blocks.each { |handler| handler.call(page) }

  @on_pages_like_blocks.each do |pattern, handlers|
    # the URL is re-read for every pattern, exactly as callbacks left it
    next unless page.url.to_s =~ pattern

    handlers.each { |handler| handler.call(page) }
  end
end
#
# Return copies of the links on +page+ that should be crawled. Candidates come
# from the focus_crawl block when one is installed, otherwise from page.links;
# only those accepted by visit_link? are kept.
#
def links_to_follow(page)
  candidates = if @focus_crawl_block
                 @focus_crawl_block.call(page)
               else
                 page.links
               end

  candidates.select { |candidate| visit_link?(candidate, page) }.map(&:dup)
end
#
# Decide whether +link+ should be crawled. A link is rejected when it is
# already in the page store, matches a skip pattern, carries a query string
# while :skip_query_strings is on, is not allowed by robots.txt, or was found
# on a page (+from_page+) that is already at the depth limit. The checks run
# in that order and stop at the first one that rejects the link.
#
def visit_link?(link, from_page = nil)
  rejected = @pages.has_page?(link) || skip_link?(link) || skip_query_string?(link)

  !rejected && allowed(link) && !too_deep?(from_page)
end
#
# Returns true when robots.txt checking is disabled; otherwise returns the
# answer of the Robots checker for +link+.
#
def allowed(link)
  return true unless @opts[:obey_robots_txt]

  @robots.allowed?(link)
end
#
# Returns true when a depth limit is configured and +from_page+ is already at
# or beyond it, so links found on that page must not be followed. Returns
# false when there is no referring page or no limit.
#
def too_deep?(from_page)
  limit = @opts[:depth_limit]
  return false unless from_page && limit

  from_page.depth >= limit
end
#
# Truthy when :skip_query_strings is enabled and +link+ has a query string.
# The result is the query string itself in that case; otherwise it is the
# falsy option value, or nil when the link has no query.
#
def skip_query_string?(link)
  enabled = @opts[:skip_query_strings]
  enabled ? link.query : enabled
end
#
# Returns true when the path of +link+ matches at least one of the patterns
# registered through skip_links_like.
#
def skip_link?(link)
  @skip_link_patterns.any? do |skip_pattern|
    link.path =~ skip_pattern
  end
end
#
# Best-effort teardown: kill every tentacle thread, ignoring any StandardError
# raised while doing so, then drop the reference to the page store.
#
def shutdown
  @tentacles.each do |tentacle|
    begin
      tentacle.kill
    rescue StandardError
      nil
    end
  end
  @pages = nil
end
end
end