Path: lib/robots.rb
#
# Copyright (c) 2008 Kyle Maxwell, contributors
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation
# files (the "Software"), to deal in the Software without
# restriction, including without limitation the rights to use,
# copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following
# conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#

require "open-uri"
require "stringio"
require "timeout"
require "uri"

# https://github.com/fizx/robots
class Robots
  DEFAULT_TIMEOUT = 3

  # Represents a parsed robots.txt file
  class ParsedRobots
    def initialize(uri, user_agent)
      @last_accessed = Time.at(1)

      io = Robots.get_robots_txt(uri, user_agent)

      # If robots.txt is missing, non-plain-text, or not a 200, fall back
      # to a permissive default that allows everything.
      if !io || io.content_type != "text/plain" || io.status.first != "200"
        io = StringIO.new("User-agent: *\nAllow: /\n")
      end

      @other = {}
      @disallows = {}
      @allows = {}
      @delays = {}
      agent = /.*/
      io.each do |line|
        next if line =~ /^\s*(#.*|$)/ # skip comments and blank lines
        arr = line.split(":")
        key = arr.shift.to_s.downcase
        value = arr.join(":").strip
        case key
        when "user-agent"
          agent = to_regex(value)
        when "allow"
          @allows[agent] ||= []
          @allows[agent] << to_regex(value)
        when "disallow"
          @disallows[agent] ||= []
          @disallows[agent] << to_regex(value)
        when "crawl-delay"
          @delays[agent] = value.to_i
        else
          @other[key] ||= []
          @other[key] << value
        end
      end

      @parsed = true
    end

    def allowed?(uri, user_agent)
      return true unless @parsed
      allowed = true
      path = uri.request_uri

      @disallows.each do |key, value|
        if user_agent =~ key
          value.each do |rule|
            allowed = false if path =~ rule
          end
        end
      end

      @allows.each do |key, value|
        unless allowed
          if user_agent =~ key
            value.each do |rule|
              allowed = true if path =~ rule
            end
          end
        end
      end

      # Honor Crawl-delay. @delays is keyed by user-agent pattern (a regex),
      # so find the first pattern matching this user agent rather than
      # looking up the raw string, and never sleep for a negative duration.
      delay_entry = @delays.find { |pattern, _| user_agent =~ pattern }
      if allowed && delay_entry
        wait = delay_entry.last - (Time.now - @last_accessed)
        sleep(wait) if wait > 0
        @last_accessed = Time.now
      end

      allowed
    end

    def other_values
      @other
    end

    protected

    # Convert a robots.txt path pattern into an anchored regex;
    # "*" is the only wildcard supported.
    def to_regex(pattern)
      return /should-not-match-anything-123456789/ if pattern.strip.empty?
      pattern = Regexp.escape(pattern)
      pattern.gsub!(Regexp.escape("*"), ".*")
      Regexp.compile("^#{pattern}")
    end
  end

  def self.get_robots_txt(uri, user_agent)
    Timeout.timeout(Robots.timeout) do
      begin
        URI.join(uri.to_s, "/robots.txt").open("User-Agent" => user_agent)
      rescue StandardError
        nil
      end
    end
  rescue Timeout::Error
    warn "robots.txt request timed out"
    nil
  end

  # Class-level setting so Robots.timeout = n affects all lookups.
  class << self
    attr_writer :timeout
  end

  def self.timeout
    @timeout || DEFAULT_TIMEOUT
  end

  def initialize(user_agent)
    @user_agent = user_agent
    @parsed = {}
  end

  def allowed?(uri)
    uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
    host = uri.host
    @parsed[host] ||= ParsedRobots.new(uri, @user_agent)
    @parsed[host].allowed?(uri, @user_agent)
  end

  def other_values(uri)
    uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
    host = uri.host
    @parsed[host] ||= ParsedRobots.new(uri, @user_agent)
    @parsed[host].other_values
  end
end
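A minimal usage sketch, assuming the file above is saved as robots.rb; the bot name and example.com URLs are placeholders, not part of the library:

require_relative "robots"

robots = Robots.new("MyBot/1.0 (+https://example.com/bot)")
Robots.timeout = 5 # seconds to wait for robots.txt before falling back to allow-all

url = "https://example.com/some/page.html"
if robots.allowed?(url)
  # Safe to fetch; any matching Crawl-delay has already been slept off,
  # and the parsed robots.txt for this host is cached for later calls.
  puts "fetching #{url}"
end

# Directives other than User-agent/Allow/Disallow/Crawl-delay (e.g. Sitemap)
# are collected verbatim, keyed by lowercased directive name:
p robots.other_values("https://example.com/")

Note that a rule like Disallow: /private* is compiled to the anchored regex ^/private.*, so it blocks /private, /private/, and /private-data alike, while paths such as /public/private are still allowed.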