Path: blob/master/lib/anemone/core.rb
require 'thread'
require 'robots'
require 'anemone/tentacle'
require 'anemone/page'
require 'anemone/exceptions'
require 'anemone/page_store'
require 'anemone/storage'
require 'anemone/storage/base'

module Anemone

  VERSION = '0.5.0'

  #
  # Convenience method to start a crawl
  #
  def Anemone.crawl(urls, options = {}, &block)
    Core.crawl(urls, options, &block)
  end

  class Core

    # PageStore storing all Page objects encountered during the crawl
    attr_reader :pages
    # Hash of options for the crawl
    attr_reader :opts

    DEFAULT_OPTS = {
      # run 4 Tentacle threads to fetch pages
      :threads => 4,
      # disable verbose output
      :verbose => false,
      # don't throw away the page response body after scanning it for links
      :discard_page_bodies => false,
      # identify self as Anemone/VERSION
      :user_agent => "Anemone/#{Anemone::VERSION}",
      # no delay between requests
      :delay => 0,
      # don't obey the robots exclusion protocol
      :obey_robots_txt => false,
      # by default, don't limit the depth of the crawl
      :depth_limit => false,
      # number of times HTTP redirects will be followed
      :redirect_limit => 5,
      # storage engine defaults to Hash in +process_options+ if none specified
      :storage => nil,
      # hash of cookie name => value to send with HTTP requests
      :cookies => nil,
      # basic authentication data to send with HTTP requests
      :http_basic_auth => nil,
      # array of raw header lines to inject into each request
      :inject_headers => [],
      # accept cookies from the server and send them back?
      :accept_cookies => false,
      # skip any link with a query string? e.g. http://foo.com/?u=user
      :skip_query_strings => false,
      :dirbust => true
    }

    # Create setter methods for all options to be called from the crawl block
    DEFAULT_OPTS.keys.each do |key|
      define_method "#{key}=" do |value|
        @opts[key.to_sym] = value
      end
    end

    #
    # Initialize the crawl with starting *urls* (single URL or Array of URLs)
    # and optional *block*
    #
    def initialize(urls, opts = {})
      @urls = [urls].flatten.map { |url| url.is_a?(URI) ? url : URI(url) }
      @urls.each { |url| url.path = '/' if url.path.empty? }

      @tentacles = []
      @on_every_page_blocks = []
      @on_pages_like_blocks = Hash.new { |hash, key| hash[key] = [] }
      @skip_link_patterns = []
      @after_crawl_blocks = []
      @opts = opts

      yield self if block_given?
    end

    #
    # Convenience method to start a new crawl
    #
    def self.crawl(urls, opts = {})
      self.new(urls, opts) do |core|
        yield core if block_given?
        core.run
      end
    end

    #
    # Add a block to be executed on the PageStore after the crawl
    # is finished
    #
    def after_crawl(&block)
      @after_crawl_blocks << block
      self
    end

    #
    # Add one or more Regex patterns for URLs which should not be
    # followed
    #
    def skip_links_like(*patterns)
      @skip_link_patterns.concat [patterns].flatten.compact
      self
    end

    #
    # Add a block to be executed on every Page as they are encountered
    # during the crawl
    #
    def on_every_page(&block)
      @on_every_page_blocks << block
      self
    end

    #
    # Add a block to be executed on Page objects with a URL matching
    # one or more patterns
    #
    def on_pages_like(*patterns, &block)
      if patterns
        patterns.each do |pattern|
          @on_pages_like_blocks[pattern] << block
        end
      end
      self
    end

    #
    # Specify a block which will select which links to follow on each page.
    # The block should return an Array of URI objects.
    #
    def focus_crawl(&block)
      @focus_crawl_block = block
      self
    end

    #
    # Perform the crawl
    #
    def run
      process_options

      @urls.delete_if { |url| !visit_link?(url) }
      return if @urls.empty?

      link_queue = Queue.new
      page_queue = Queue.new

      @opts[:threads].times do
        @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
      end

      @urls.each { |url| link_queue.enq(url) }

      loop do
        page = page_queue.deq
        @pages.touch_key page.url
        puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
        do_page_blocks page
        page.discard_doc! if @opts[:discard_page_bodies]

        links = links_to_follow page
        links.each do |link|
          link_queue << [link, page.url.dup, page.depth + 1]
        end
        @pages.touch_keys links

        @pages[page.url] = page

        # if we are done with the crawl, tell the threads to end
        if link_queue.empty? and page_queue.empty?
          until link_queue.num_waiting == @tentacles.size
            Thread.pass
          end
          if page_queue.empty?
            @tentacles.size.times { link_queue << :END }
            break
          end
        end
      end

      @tentacles.each { |thread| thread.join }
      do_after_crawl_blocks
      self
    end

    private

    def process_options
      @opts = DEFAULT_OPTS.merge @opts
      @opts[:threads] = 1 if @opts[:delay] > 0
      storage = Anemone::Storage::Base.new(@opts[:storage] || Anemone::Storage.Hash)
      @pages = PageStore.new(storage)
      @robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]

      # freeze_options
    end

    #
    # Freeze the opts Hash so that no options can be modified
    # once the crawl begins
    #
    def freeze_options
      @opts.freeze
      @opts.each_key { |key| @opts[key].freeze }
      @opts[:cookies].each_key { |key| @opts[:cookies][key].freeze } rescue nil
    end

    #
    # Execute the after_crawl blocks
    #
    def do_after_crawl_blocks
      @after_crawl_blocks.each { |block| block.call(@pages) }
    end

    #
    # Execute the on_every_page blocks for *page*
    #
    def do_page_blocks(page)
      @on_every_page_blocks.each do |block|
        block.call(page)
      end

      @on_pages_like_blocks.each do |pattern, blocks|
        blocks.each { |block| block.call(page) } if page.url.to_s =~ pattern
      end
    end

    #
    # Return an Array of links to follow from the given page,
    # based on whether or not the link has already been crawled,
    # and the block given to focus_crawl()
    #
    def links_to_follow(page)
      links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
      links.select { |link| visit_link?(link, page) }.map { |link| link.dup }
    end

    #
    # Returns +true+ if *link* has not been visited already,
    # is not excluded by a skip_link pattern,
    # is not excluded by robots.txt,
    # and is not deeper than the depth limit.
    # Returns +false+ otherwise.
    #
    def visit_link?(link, from_page = nil)
      !@pages.has_page?(link) &&
        !skip_link?(link) &&
        !skip_query_string?(link) &&
        allowed(link) &&
        !too_deep?(from_page)
    end

    #
    # Returns +true+ if we are obeying robots.txt and the link
    # is granted access in it. Always returns +true+ when we are
    # not obeying robots.txt.
    #
    def allowed(link)
      @opts[:obey_robots_txt] ? @robots.allowed?(link) : true
    end

    #
    # Returns +true+ if we are over the page depth limit.
    # This only works when coming from a page and with the +depth_limit+ option set.
    # When neither is the case, will always return +false+.
    #
    def too_deep?(from_page)
      if from_page && @opts[:depth_limit]
        from_page.depth >= @opts[:depth_limit]
      else
        false
      end
    end

    #
    # Returns +true+ if *link* should not be visited because
    # it has a query string and +skip_query_strings+ is true.
    #
    def skip_query_string?(link)
      @opts[:skip_query_strings] && link.query
    end

    #
    # Returns +true+ if *link* should not be visited because
    # its URL matches a skip_link pattern.
    #
    def skip_link?(link)
      @skip_link_patterns.any? { |pattern| link.path =~ pattern }
    end

    #
    # Kills all active threads
    #
    def shutdown
      @tentacles.each { |t| t.kill rescue nil }
      @pages = nil
    end

  end
end
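
The snippet below is a minimal usage sketch, not part of core.rb: the option keys, the generated setter, and the block methods all come from the class above, while the start URL, the skip pattern, and the printed output are hypothetical placeholders. It also assumes the gem's usual 'anemone' entry point is available to require.

require 'anemone'

Anemone.crawl("http://example.com/", :threads => 4, :depth_limit => 2) do |anemone|
  # option setters generated from DEFAULT_OPTS can also be called inside the block
  anemone.verbose = true

  # regex patterns for links that should never be followed (hypothetical pattern)
  anemone.skip_links_like(/\/logout/)

  # runs for every Page as it comes off the page queue
  anemone.on_every_page do |page|
    puts page.url
  end

  # runs with the PageStore of all crawled pages once the crawl has finished
  anemone.after_crawl do |pages|
    puts "crawl finished"
  end
end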