Path: blob/master/lib/anemone/extractors/generic.rb
19592 views
require 'uri'

#
# Catch-all extractor: picks up every http/https URL that appears anywhere in
# the stringified document, rather than only URLs found in specific markup.
#
# NOTE(review): `doc` and `html` are not defined in this file -- presumably
# they come from Anemone::Extractors::Base; confirm there. URLs are extracted
# from `doc.to_s` but the quote check below runs against `html`; verify the
# two always hold the same text.
#
# NOTE(review): Ruby's uri library documents URI.extract as obsolete; confirm
# it is still available on the Ruby versions this project targets.
#
class Anemone::Extractors::Generic < Anemone::Extractors::Base
  #
  # Extracts URLs from `doc.to_s` and trims quote garbage off each match.
  #
  # Because the scan is not markup-aware it also matches URLs embedded in
  # things like inline JS, where the closing quote (and whatever follows it)
  # ends up glued to the match. For example, given:
  #
  #   var = 'http://blah.com?id=1'
  #   var = { 'http://blah.com?id=1', 1 }
  #
  # the raw matches are:
  #
  #   http://blah.com?id=1'
  #   http://blah.com?id=1',
  #
  # When a match contains a quote and `html` shows that same kind of quote
  # sitting directly in front of the match, the quote is treated as a string
  # delimiter and the match is cut at its first occurrence. The single quote
  # is tried before the double quote. Matches that do not fit either case
  # are returned untouched.
  #
  # @return [Array<String>] the extracted URLs, or an empty array when any
  #   StandardError is raised during extraction
  #
  def run
    URI.extract(doc.to_s, %w(http https)).map do |url|
      # Nothing to clean up when the match carries no quote character.
      next url unless includes_quotes?(url)

      # Which delimiter (if any) opens the string literal holding this URL?
      delimiter = ["'", '"'].find { |quote| html.include?("#{quote}#{url}") }

      delimiter ? url.split(delimiter).first : url
    end
  rescue
    # Best-effort: a failed extraction yields no links instead of an error.
    []
  end

  #
  # Tells whether the given URL contains a single or a double quote.
  #
  # @param url [String] candidate URL
  # @return [Boolean] true when either quote character is present
  #
  def includes_quotes?(url)
    ["'", '"'].any? { |quote| url.include?(quote) }
  end
end