Module:HtmlParser
Jump to navigation
Jump to search
Documentation for this module may be created at Module:HtmlParser/doc
local p = {}
-- LUA HTML parser
-- From: https://gist.github.com/exebetche/6126573
local empty_tags = {
br = true,
hr = true,
img = true,
embed = true,
param = true,
area = true,
col = true,
input = true,
meta = true,
link = true,
base = true,
basefont = true,
frame = true,
isindex = true
}
-- omittable tags siblings
-- if an open tag from the primary entry follow
-- an unclosed tag of the secondary,
-- the secondary is automatically closed
-- See http://www.w3.org/TR/html5/syntax.html#optional-tags
local omittable_tags = {
tbody = {
thead = true,
tbody = true,
tfoot = true
},
thead = {
thead = true,
tbody = true,
tfoot = true
},
tfoot = {
thead = true,
tbody = true,
tfoot = true
},
td = {
td = true,
th = true
},
th = {
td = true,
th = true
},
tr = {
tr = true
},
dd = {
dd = true,
dt = true
},
dt = {
dd = true,
dt = true
},
optgroup = {
optgroup = true,
option = true
},
optgroup = {
optgroup = true,
option = true
},
address = { p = true},
article = { p = true},
aside = { p = true},
blockquote = { p = true},
dir = { p = true},
div = { p = true},
dl = { p = true},
fieldset = { p = true},
footer = { p = true},
form = { p = true},
h1 = { p = true},
h2 = { p = true},
h3 = { p = true},
h4 = { p = true},
h5 = { p = true},
h6 = { p = true},
header = { p = true},
hgroup = { p = true},
hr = { p = true},
menu = { p = true},
nav = { p = true},
ol = { p = true},
p = { p = true},
pre = { p = true},
section = { p = true},
table = { p = true},
ul= { p = true}
}
-- omittable tags children
local omittable_tags2 = {
table = {
tr = true,
td = true,
p = true,
},
tr = {
td = true,
p = true
},
td = {
p = true
}
}
function p.parse_html(data, lazy)
local tree = {}
local stack = {}
local level = 0
local new_level = 0
table.insert(stack, tree)
local node
local lower_tag
local script_open = false
local script_val = ""
local script_node = nil
local tag_match = ""
lazy = lazy or false
for b, op, tag, attr, op2, bl1, val, bl2 in string.gmatch(
data,
"(<)(%/?!?)([%w:_-'\\\"%[]+)(.-)(%/?%-?)>"..
"([%s\r\n\t]*)([^<]*)([%s\r\n\t]*)"
) do
lower_tag = string.lower(tag)
if script_open then
if lower_tag == "script" and op == "/" then
node.childNodes[1].value = string.gsub(script_val, "^<!%[CDATA%[", "<!--//%1")
if val ~= "" then
table.insert(stack[level], {
tagName = "textNode",
value = val
})
end
level = level - 1
script_open = false
else
script_val = script_val..b..op..tag..attr..op2..bl1..val..bl2
end
elseif op == "!" then
elseif op == "/" then
-- Check if the previous children elements end tag have been omitted
-- and should be close automatically
while not lazy
and omittable_tags2[lower_tag]
and #stack[level] > 0
and omittable_tags2[lower_tag][stack[level][#stack[level]].tagName]
do
print("Auto closing "..
stack[level][#stack[level]].tagName..
" followed by ending "..lower_tag)
level = level - 1
table.remove(stack)
end
if level==0 then return tree end
if lower_tag ~= stack[level][#stack[level]].tagName
then
print("Mismatch: "..lower_tag..
", (has "..stack[level][#stack[level]].tagName..")")
end
level = level - 1
table.remove(stack)
else
level = level + 1
node = nil
node = {}
node.tagName = lower_tag
node.childNodes = {}
if attr ~= "" then
node.attr = {}
for n, v in string.gmatch(
attr,
"%s([^%s=]+)=\"([^\"]+)\""
) do
node.attr[n] = string.gsub(v, '"', '[^\\]\\"')
end
for n, v in string.gmatch(
attr,
"%s([^%s=]+)='([^']+)'"
) do
node.attr[n] = string.gsub(v, '"', '[^\\]\\"')
end
end
if lower_tag == "script"
and node.attr
and not node.attr["src"]
then
script_val = bl1..val..bl2
table.insert(node.childNodes, {
tagName = "textNode",
value = ""
})
table.insert(stack[level], node)
script_open = true
else
-- Check if the previous sibling element end tag has been omitted
-- and should be close automatically
if not lazy
and omittable_tags[lower_tag]
and level > 1
and stack[level-1]
and #stack[level-1] > 0
and omittable_tags[lower_tag][stack[level-1][#stack[level-1]].tagName] == true
then
print("Auto closing "..
stack[level-1][#stack[level-1]].tagName..
" followed by "..lower_tag)
level = level - 1
table.remove(stack)
if level==0 then return tree end
end
table.insert(stack[level], node)
if empty_tags[lower_tag] then
if val ~= "" then
table.insert(stack[level], {
tagName = "textNode",
value = val
})
end
node.childNodes = nil
level = level - 1
else
if val ~= "" then
table.insert(node.childNodes, {
tagName = "textNode",
value = val
})
end
table.insert(stack, node.childNodes)
end
end
end
end
if level~=0 then
vlc.msg.dbg("Parse error: "..level)
end
--collectgarbage()
return tree
end
function p.dump_html(data)
local stack = {data}
local d = ""
local node = nil
while #stack ~= 0 do
node = nil
node = stack[#stack][1]
if not node then break end
if node.tagName == "textNode" then
d = d..mw.text.trim(node.value)
else
d = d.."\n"..string.rep (" ", #stack-1)
d = d.."<"..node.tagName
if node.attr then
for a, v in pairs(node.attr) do
d = d.." "..a..'="'..v..'"'
end
end
if empty_tags[node.tagName] then
d = d.."/>"
else
d = d..">"
end
end
if node.childNodes and #node.childNodes > 0 then
node.l = #node.childNodes
table.insert(stack, node.childNodes)
else
table.remove(stack[#stack], 1)
if node.childNodes and #node.childNodes == 0 and not empty_tags[node.tagName] then
d = d.."</"..node.tagName..">"
end
while #stack > 0 and #stack[#stack] == 0 do
table.remove(stack)
if #stack > 0 then
if stack[#stack][1].l > 1 then
d = d.."\n"..string.rep(" ", #stack-1).."</"..stack[#stack][1].tagName..">"
else
d = d.."</"..stack[#stack][1].tagName..">"
end
table.remove(stack[#stack], 1)
end
end
end
end
return d
end
return p