2003-03-28 22:08:50 +01:00
|
|
|
-----------------------------------------------------------------------------
|
|
|
|
-- Little program that checks links in HTML files
|
2003-06-26 20:47:49 +02:00
|
|
|
-- LuaSocket sample files
|
2003-06-09 20:23:40 +02:00
|
|
|
-- Author: Diego Nehab
|
|
|
|
-- RCS ID: $Id$
|
2003-03-28 22:08:50 +01:00
|
|
|
-----------------------------------------------------------------------------
|
2004-06-04 17:15:45 +02:00
|
|
|
local http = require("http")
|
|
|
|
local url = require("url")
|
|
|
|
-- Assigns 10 to the TIMEOUT field of the http module (presumably a timeout
-- in seconds applied to HTTP operations -- confirm against the LuaSocket
-- documentation).
http.TIMEOUT = 10
|
2001-09-27 22:02:58 +02:00
|
|
|
|
|
|
|
-- Global memo table consulted and filled in by getstatus, indexed by URL.
cache = {}
|
|
|
|
|
|
|
|
-----------------------------------------------------------------------------
-- Reads the entire contents of a local file.
-- Input
--   path: file path; it is passed through url.unescape before being opened
-- Returns
--   the file contents on success, or nil followed by the message reported
--   by io.open on failure
-----------------------------------------------------------------------------
function readfile(path)
    local handle, err = io.open(url.unescape(path), "r")
    if not handle then return nil, err end
    local contents = handle:read("*a")
    handle:close()
    return contents
end
|
|
|
|
|
2004-06-04 17:15:45 +02:00
|
|
|
-----------------------------------------------------------------------------
-- Checks whether the target of a URL can be reached, memoizing the outcome
-- in the global `cache` table so each URL is probed at most once.
-- Input
--   u: URL to check; a URL without a scheme is treated as "file"
-- Returns
--   nil if the target is reachable, otherwise a message describing why
--   it is not
-----------------------------------------------------------------------------
function getstatus(u)
    -- Successful checks are stored as false, because assigning nil to a
    -- table field stores nothing. Compare against nil to tell "known good"
    -- apart from "never checked", and map false back to nil for callers.
    local cached = cache[u]
    if cached ~= nil then return cached or nil end
    local parsed = url.parse(u, {scheme = "file"})
    local res
    if parsed.scheme == "http" then
        -- HEAD is enough: only the status matters, not the document body
        local response, err = http.request({url = u, method = "HEAD"})
        if not response then
            -- defensive: the failure convention of http.request is not
            -- visible in this file, so never index a missing response
            res = err or "request failed"
        elseif response.code == 200 then
            res = nil
        else
            res = response.status or response.error
        end
    elseif parsed.scheme == "file" then
        local file, err = io.open(url.unescape(parsed.path), "r")
        if file then
            file:close()
            res = nil
        else
            res = err
        end
    else
        res = string.format("unhandled scheme '%s'", parsed.scheme)
    end
    cache[u] = res or false
    return res
end
|
|
|
|
|
2004-06-04 17:15:45 +02:00
|
|
|
-----------------------------------------------------------------------------
-- Fetches the document a URL refers to.
-- Input
--   u: URL of the document; a URL without a scheme is treated as "file"
-- Returns
--   base: URL that relative links in the document should be resolved
--     against (the Location header of a 200 response when one is present,
--     otherwise u itself)
--   body: document contents, or nil if they could not be obtained
--   error: error message, if any
-----------------------------------------------------------------------------
function retrieve(u)
    local parsed = url.parse(u, { scheme = "file" })
    local body, headers, code, err
    local base = u
    if parsed.scheme == "http" then
        body, headers, code, err = http.get(u)
        -- Prefer the Location header as the base. It is resolved against u
        -- in case the header carries a relative reference.
        if code == 200 and headers and headers.location then
            base = url.absolute(u, headers.location)
        end
    elseif parsed.scheme == "file" then
        body, err = readfile(parsed.path)
    else
        err = string.format("unhandled scheme '%s'", parsed.scheme)
    end
    return base, body, err
end
|
|
|
|
|
|
|
|
-----------------------------------------------------------------------------
-- Extracts the targets of all href attributes found in an HTML document.
-- Input
--   body: HTML text
--   base: URL that each href is resolved against with url.absolute
-- Returns
--   an array of URLs: double-quoted hrefs first, then single-quoted ones,
--   then unquoted ones
-----------------------------------------------------------------------------
function getlinks(body, base)
    -- get rid of comments
    body = string.gsub(body, "%<%!%-%-.-%-%-%>", "")
    local links = {}
    -- Records one href and erases it from the text. The explicit "" matters:
    -- when a gsub callback returns nil, Lua 5.1 and later keep the matched
    -- text, so a quoted href would be picked up again by the unquoted pass.
    local function collect(href)
        table.insert(links, url.absolute(base, href))
        return ""
    end
    -- extract links, quoted forms first
    body = string.gsub(body, '[Hh][Rr][Ee][Ff]%s*=%s*"([^"]*)"', collect)
    body = string.gsub(body, "[Hh][Rr][Ee][Ff]%s*=%s*'([^']*)'", collect)
    -- an unquoted value ends at the first whitespace or '>', so attributes
    -- that follow it are not swallowed into the link
    string.gsub(body, "[Hh][Rr][Ee][Ff]%s*=%s*([^%s>]+)", collect)
    return links
end
|
|
|
|
|
2004-06-04 17:15:45 +02:00
|
|
|
-----------------------------------------------------------------------------
-- Retrieves a document and reports the status of every link found in it.
-- Each link is written to stderr; a link for which getstatus returns a
-- message is written a second time together with that message. If the
-- document itself cannot be retrieved, the error is printed and nothing
-- else happens.
-- Input
--   u: URL of the document to check
-----------------------------------------------------------------------------
function checklinks(u)
    local base, body, err = retrieve(u)
    if not body then
        print(err)
        return
    end
    local out = io.stderr
    for _, link in ipairs(getlinks(body, base)) do
        out:write("\t", link, "\n")
        local problem = getstatus(link)
        if problem then
            out:write('\t', link, ": ", problem, "\n")
        end
    end
end
|
|
|
|
|
|
|
|
-----------------------------------------------------------------------------
-- Command-line driver: checks the links of every document named on the
-- command line. With no arguments, prints a usage message and stops.
-----------------------------------------------------------------------------
arg = arg or {}
-- arg[1] == nil is the "no arguments" test that works in every Lua version
-- (table.getn was removed in Lua 5.2)
if arg[1] == nil then
    print("Usage:\n luasocket check-links.lua {<url>}")
    -- a bare exit() is not a standard Lua 5 global; leave with a failure
    -- status through the os library instead
    os.exit(1)
end
for _, a in ipairs(arg) do
    print("Checking ", a)
    -- arguments without a scheme are taken to be local files
    checklinks(url.absolute("file:", a))
end
|