CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutSign UpSign In
rapid7

CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!

GitHub Repository: rapid7/metasploit-framework
Path: blob/master/lib/robots.rb
Views: 1903
#
# Copyright (c) 2008 Kyle Maxwell, contributors
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation
# files (the "Software"), to deal in the Software without
# restriction, including without limitation the rights to use,
# copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following
# conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#
25
26
require "open-uri"
27
require "uri"
28
require "timeout"
29
30
# https://github.com/fizx/robots
31
# Answers "may this user agent fetch this URI?" by consulting each host's
# robots.txt. One ParsedRobots instance is built lazily per host and cached
# for the lifetime of the Robots object.
class Robots
  # Seconds allowed for a robots.txt fetch when no class-level timeout is set.
  DEFAULT_TIMEOUT = 3

  # Represents a parsed robots.txt file
  class ParsedRobots
    # Fetches robots.txt for +uri+ via Robots.get_robots_txt and parses it.
    # When nothing is returned, or the response is not a "200" with content
    # type "text/plain", an allow-everything rule set is parsed instead.
    #
    # @param uri [URI, String] any URI on the target host
    # @param user_agent [String] value sent as the User-Agent request header
    def initialize(uri, user_agent)
      # Far in the past, so the first permitted request is never delayed.
      @last_accessed = Time.at(1)
      @other = {}
      @disallows = {}
      @allows = {}
      @delays = {}

      fetched = Robots.get_robots_txt(uri, user_agent)
      begin
        parse(usable?(fetched) ? fetched : StringIO.new("User-agent: *\nAllow: /\n"))
      ensure
        # Release the handle returned by the fetch once it has been read.
        fetched.close if fetched.respond_to?(:close)
      end

      @parsed = true
    end

    # Decides whether +user_agent+ may request +uri+. A request is permitted
    # unless a Disallow rule of a matching agent group matches the path and no
    # Allow rule of a matching agent group does. A permitted request is
    # throttled according to the Crawl-delay of the matching agent groups.
    #
    # @param uri [URI::HTTP] must respond to +request_uri+
    # @param user_agent [String]
    # @return [Boolean]
    def allowed?(uri, user_agent)
      return true unless @parsed

      path = uri.request_uri
      allowed = !rule_matches?(@disallows, user_agent, path) ||
                rule_matches?(@allows, user_agent, path)
      throttle(user_agent) if allowed
      allowed
    end

    # @return [Hash{String => Array<String>}] values of every directive other
    #   than user-agent, allow, disallow and crawl-delay, keyed by the
    #   lower-cased directive name
    def other_values
      @other
    end

    protected

    # Converts a robots.txt pattern into a Regexp anchored at the start of the
    # string, with "*" acting as a wildcard. An empty pattern yields a Regexp
    # that is not expected to match any real path or agent.
    def to_regex(pattern)
      return /should-not-match-anything-123456789/ if pattern.strip.empty?

      escaped = Regexp.escape(pattern)
      escaped.gsub!(Regexp.escape("*"), ".*")
      Regexp.compile("^#{escaped}")
    end

    private

    # True when the fetched object is a "200" response of type "text/plain".
    def usable?(io)
      return false unless io

      io.content_type == "text/plain" && io.status.first == "200"
    end

    # Reads +io+ line by line and fills the rule tables. Rules seen before any
    # User-agent line are attached to a match-everything agent.
    def parse(io)
      agent = /.*/
      io.each do |line|
        next if line =~ /^\s*(#.*|$)/

        # Split on the first colon only, so values such as URLs stay intact.
        key, value = line.split(":", 2)
        key = key.to_s.strip.downcase
        value = value.to_s.strip

        case key
        when "user-agent"
          agent = to_regex(value)
        when "allow"
          (@allows[agent] ||= []) << to_regex(value)
        when "disallow"
          (@disallows[agent] ||= []) << to_regex(value)
        when "crawl-delay"
          @delays[agent] = value.to_i
        else
          (@other[key] ||= []) << value
        end
      end
    end

    # True when any rule of any agent group matching +user_agent+ matches +path+.
    def rule_matches?(rules_by_agent, user_agent, path)
      rules_by_agent.any? do |agent, rules|
        agent.match?(user_agent) && rules.any? { |rule| rule.match?(path) }
      end
    end

    # Largest Crawl-delay among the agent groups matching +user_agent+, or nil
    # when no matching group declares one. The table is keyed by Regexp, so it
    # has to be searched by matching rather than looked up by the agent string.
    def delay_for(user_agent)
      @delays.select { |agent, _| agent.match?(user_agent) }.values.max
    end

    # Sleeps for whatever part of the crawl delay has not yet elapsed since the
    # previous permitted request, then records the access time.
    # NOTE(review): the delay comes straight from the remote robots.txt and is
    # not capped here — confirm whether callers need an upper bound.
    def throttle(user_agent)
      delay = delay_for(user_agent)
      return unless delay

      remaining = delay - (Time.now - @last_accessed)
      # Kernel#sleep raises on a negative interval, so only wait when needed.
      sleep(remaining) if remaining > 0
      @last_accessed = Time.now
    end
  end

  # Downloads /robots.txt from the host of +uri+, giving up after
  # Robots.timeout seconds.
  #
  # @param uri [URI, String]
  # @param user_agent [String] value sent as the User-Agent request header
  # @return [Object, nil] whatever the URI's +open+ returns, or nil when the
  #   request raised a StandardError or timed out
  def self.get_robots_txt(uri, user_agent)
    Timeout.timeout(Robots.timeout) do
      begin
        URI.join(uri.to_s, "/robots.txt").open("User-Agent" => user_agent)
      rescue StandardError
        nil
      end
    end
  rescue Timeout::Error
    # dlog is not defined in this file; it must be provided by the host application.
    dlog("robots.txt request timed out")
    nil
  end

  # Instance-level writer kept for backward compatibility. The fetch reads
  # Robots.timeout, so this value is not consulted anywhere in this class.
  attr_writer :timeout

  class << self
    # Sets the fetch timeout, in seconds, returned by Robots.timeout.
    attr_writer :timeout
  end

  # @return [Numeric] the class-level timeout if set, else DEFAULT_TIMEOUT
  def self.timeout
    @timeout || DEFAULT_TIMEOUT
  end

  # @param user_agent [String] agent name used both for fetching robots.txt
  #   and for matching its User-agent groups
  def initialize(user_agent)
    @user_agent = user_agent
    @parsed = {}
  end

  # @param uri [URI, String]
  # @return [Boolean] whether the configured user agent may request +uri+
  def allowed?(uri)
    uri = coerce_uri(uri)
    parsed_for(uri).allowed?(uri, @user_agent)
  end

  # @param uri [URI, String]
  # @return [Hash{String => Array<String>}] the non-rule directives of the
  #   robots.txt belonging to the host of +uri+
  def other_values(uri)
    parsed_for(coerce_uri(uri)).other_values
  end

  private

  # Parses anything that is not already a URI from its string form.
  def coerce_uri(uri)
    uri.is_a?(URI) ? uri : URI.parse(uri.to_s)
  end

  # Returns the cached ParsedRobots for the URI's host, building it on first use.
  def parsed_for(uri)
    @parsed[uri.host] ||= ParsedRobots.new(uri, @user_agent)
  end
end
162
163