Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place.
Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place.
Path: blob/master/lib/anemone/page.rb
Views: 11766
require 'nokogiri'
require 'ostruct'
require 'webrick/cookie'

module Anemone

  # Path extractor container namespace.
  module Extractors
    #
    # Common base for extractors: holds the Page being processed and exposes
    # its parsed document. Page#run_extractors instantiates each extractor
    # with the page and calls +run+ on it, so subclasses are expected to
    # provide that method.
    #
    class Base
      # The Page this extractor operates on
      attr_reader :page

      def initialize(page)
        @page = page
      end

      # Parsed document of the wrapped page (delegates to Page#doc)
      def doc
        page.doc
      end
    end
  end

  class Page

    # The URL of the page
    attr_reader :url
    # The raw HTTP response body of the page
    attr_reader :body
    # Headers of the HTTP response
    attr_reader :headers
    # URL of the page this one redirected to, if any
    attr_reader :redirect_to
    # Exception object, if one was raised during HTTP#fetch_page
    attr_reader :error

    # OpenStruct for user-stored data
    attr_accessor :data
    # Integer response code of the page
    attr_accessor :code
    # Boolean indicating whether or not this page has been visited in PageStore#shortest_paths!
    attr_accessor :visited
    # Depth of this page from the root of the crawl. This is not necessarily the
    # shortest path; use PageStore#shortest_paths! to find that value.
    attr_accessor :depth
    # URL of the page that brought us to this page
    attr_accessor :referer
    # Response time of the request for this page in milliseconds
    attr_accessor :response_time
    # Storage for the original HTTP request that generated this response
    attr_accessor :request

    #
    # Create a new page
    #
    # *url* is used as the base for resolving relative links (it must respond
    # to +merge+ and +host+). Recognised *params* keys: +:dirbust+, +:code+,
    # +:headers+, +:aka+, +:referer+, +:depth+, +:redirect_to+,
    # +:response_time+, +:body+ and +:error+.
    #
    def initialize(url, params = {})
      @url = url
      @data = OpenStruct.new

      @dirbust = params[:dirbust]
      @code = params[:code]
      @headers = params[:headers] || {}
      # Guarantee a content-type entry so #content_type never returns nil.
      @headers['content-type'] ||= ['']
      @aliases = Array(params[:aka]).compact
      @referer = params[:referer]
      @depth = params[:depth] || 0
      @redirect_to = to_absolute(params[:redirect_to])
      @response_time = params[:response_time]
      @body = params[:body]
      @error = params[:error]

      # A page counts as fetched as soon as a response code is known.
      @fetched = !params[:code].nil?
    end

    #
    # All extractor classes found under Anemone::Extractors, except Base.
    # The first call requires every file matching extractors/*.rb next to
    # this file; the resulting list is memoized on the class.
    #
    def self.extractors
      return @extractors if @extractors

      lib = File.dirname(__FILE__) + '/extractors/*.rb'
      Dir.glob(lib).each { |e| require e }

      @extractors = Extractors.constants.map do |e|
        next if e == :Base
        Extractors.const_get(e)
      end.compact
    end

    #
    # Runs every extractor against this page and returns the distinct
    # absolute URLs that are in the same domain as the page. Returns an
    # empty Array when there is no parsed document.
    #
    # Extraction is best-effort: an extractor that raises, or a path that
    # cannot be converted into an absolute URL, is skipped.
    #
    def run_extractors
      return [] unless doc

      found = self.class.extractors.map do |e|
        # The Dirbuster extractor only runs when it was requested for this page.
        next if e == Extractors::Dirbuster && !dirbust?
        e.new(self).run rescue next
      end

      found.flatten.compact.map do |p|
        abs = to_absolute(URI(p)) rescue next
        in_domain?(abs) ? abs : nil
      end.compact.uniq
    end

    #
    # Array of distinct in-domain URLs found on the page.
    #
    # MODIFIED: Dig URLs from elements other than "A" refs
    # (the result of #run_extractors, memoized).
    #
    def links
      @links ||= run_extractors
    end

    #
    # Nokogiri document for the HTML body
    #
    # Returns +nil+ when there is no body, the page is not HTML, or parsing
    # raises.
    #
    def doc
      return @doc if @doc
      @doc = Nokogiri::HTML(@body) if @body && html? rescue nil
    end

    #
    # Delete the Nokogiri document and response body to conserve memory
    #
    def discard_doc!
      links # force parsing of page links before we trash the document
      @doc = @body = nil
    end

    #
    # Was the page successfully fetched?
    # +true+ if the page was fetched with no error, +false+ otherwise.
    #
    def fetched?
      @fetched
    end

    #
    # Array of cookies received with this page as WEBrick::Cookie objects.
    # Returns an empty Array if the header is missing or cannot be parsed.
    #
    def cookies
      WEBrick::Cookie.parse_set_cookies(@headers['Set-Cookie']) rescue []
    end

    #
    # The content-type returned by the HTTP request for this page
    #
    def content_type
      res = headers['content-type']
      # Header values may be stored as an Array; use the first entry.
      res = res.first if res.kind_of?(::Array)
      res
    end

    #
    # Returns +true+ if the page is a HTML document, returns +false+
    # otherwise.
    #
    def html?
      # The '+' must be escaped: unescaped it is a quantifier on the preceding
      # 'l', so the literal media type "application/xhtml+xml" never matched.
      !!(content_type =~ %r{^(text/html|application/xhtml\+xml)\b})
    end

    #
    # Returns +true+ if the page is a HTTP redirect, returns +false+
    # otherwise.
    #
    def redirect?
      (300..307).include?(@code)
    end

    #
    # Returns +true+ if the page was not found (returned 404 code),
    # returns +false+ otherwise.
    #
    def not_found?
      404 == @code
    end

    #
    # Converts relative URL *link* into an absolute URL based on the
    # location of the page. Returns +nil+ when *link* is +nil+.
    #
    def to_absolute(link)
      return nil if link.nil?

      # remove anchor
      link = URI::DEFAULT_PARSER.escape(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/, ''))

      relative = URI(link)
      absolute = @url.merge(relative)

      # Normalise an empty path to the root path.
      absolute.path = '/' if absolute.path.empty?

      absolute
    end

    #
    # Value of the +:dirbust+ option this page was created with.
    #
    def dirbust?
      @dirbust
    end

    #
    # Returns +true+ if *uri* is in the same domain as the page, returns
    # +false+ otherwise
    #
    def in_domain?(uri)
      uri.host == @url.host
    end

    #
    # State used by Marshal. Only the fields listed here are serialized;
    # @doc, @aliases, @dirbust, @error and @request are not included.
    #
    def marshal_dump
      [@url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched]
    end

    #
    # Restores the state produced by #marshal_dump (same field order).
    #
    def marshal_load(ary)
      @url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched = ary
    end

    #
    # Hash representation of the page with String keys. URLs are converted
    # to Strings; headers and data are stored as Marshal dumps.
    #
    def to_hash
      {
        'url' => @url.to_s,
        'headers' => Marshal.dump(@headers),
        'data' => Marshal.dump(@data),
        'body' => @body,
        'links' => links.map(&:to_s),
        'code' => @code,
        'visited' => @visited,
        'depth' => @depth,
        'referer' => @referer.to_s,
        'redirect_to' => @redirect_to.to_s,
        'response_time' => @response_time,
        'fetched' => @fetched
      }
    end

    #
    # Rebuilds a Page from a Hash in the format produced by #to_hash.
    #
    # The code, depth and response time are coerced with +to_i+, and the
    # referer is kept as the stored String.
    #
    # SECURITY: 'headers' and 'data' are passed to Marshal.load, which can
    # instantiate arbitrary objects. Only call this with hashes that come
    # from trusted storage.
    #
    def self.from_hash(hash)
      page = self.new(URI(hash['url']))
      {
        '@headers' => Marshal.load(hash['headers']),
        '@data' => Marshal.load(hash['data']),
        '@body' => hash['body'],
        '@links' => hash['links'].map { |link| URI(link) },
        '@code' => hash['code'].to_i,
        '@visited' => hash['visited'],
        '@depth' => hash['depth'].to_i,
        '@referer' => hash['referer'],
        '@redirect_to' => URI(hash['redirect_to']),
        '@response_time' => hash['response_time'].to_i,
        '@fetched' => hash['fetched']
      }.each do |var, value|
        page.instance_variable_set(var, value)
      end
      page
    end

    #
    # Deep copy made by a Marshal round-trip, so the copy carries only the
    # fields listed in #marshal_dump.
    #
    def dup
      Marshal.load(Marshal.dump(self))
    end

  end
end