DZone Snippets is a public source code repository. Easily build up your personal collection of code snippets, categorize them with tags / keywords, and share them with the world
Regex-based Parsing Technique
#!/usr/local/bin/ruby -w
# Regular expressions and strings with embedded objects
# From: http://t-a-w.blogspot.com/2007/06/regular-expressions-and-strings-with.html
# Author: Tomasz Węgrzanowski
# License:
# Creative Commons License, http://creativecommons.org/licenses/by-sa/3.0/
# GNU Free Documentation License, http://en.wikipedia.org/wiki/GNU_Free_Documentation_License
def hash_or_die(kw)
Hash.new{|ht,k| raise "Unknown key: #{k}"}.merge(kw)
end
def parse(data)
esc = hash_or_die "\\" => "A", "\"" => "B", "n" => "C", "'" => "D"
rev_esc = hash_or_die "A" => "\\", 'B' => "\"", "C" => "n", "D" => "'"
data = data.gsub(/\\(.)/) {"\x00" + esc[$1]}
strs = []
data = data.gsub(/('[^']*')/) { # '
strs << $1
"\x01<#{strs.size-1}>"
}
records = []
data.scan(/\((.*?)\)/) {
records << $1.split(/,/).map{|field|
field.gsub(/\x01<(\d+)>/) {
strs[$1.to_i]}.gsub(/\x00(.)/){ rev_esc[$1]
}
}
}
records
end
def sql_str_unquote(str)
str =~ /\A'(.*)'\Z/ or raise "SQL string format is wrong: #{str}"
$1.gsub(/\\(.)/) {$1}
end
=begin
page_fn = Dir["plwiki-*-page.sql"].sort[-1]
externallinks_fn = Dir["plwiki-*-externallinks.sql"].sort[-1]
pages = {}
File.open(page_fn).each{|line|
next unless line =~ /\AINSERT INTO `page` VALUES (.*)\Z/
parse($1).each{|id,ns,title,*stuff|
next unless ns == "0"
title = sql_str_unquote(title)
pages[id] = title
}
}
File.open(externallinks_fn).each{|line|
next unless line =~ /\AINSERT INTO `externallinks` VALUES (.*)\Z/
parse($1).each{|from,to,index|
title = pages[from]
next unless title
to = sql_str_unquote(to)
next unless to =~ /\Ahttp:\/\//
puts "#{title}\t#{to}"
}
}
=end
sql_dump = <<-EOS
INSERT INTO `page` VALUES (1,0,'Astronomia','',1800,0,0,0.600461925007833,'20070601091320',8076762,8584,0), (2,0,'AWK','',329,0,0,0.487812640599732,'20070530195555',8058046,4265,0), (4,0,'Alergologia','',108,0,0,0.580574716050713,'20070520093413',7912844,292,0), ...
INSERT INTO `page` VALUES (14880,0,'Dźwignica_linotorowa','',26,0,0,0.597327036408081,'20060814072401',4282357,727,0), (14881,0,'Urządzenia_transportowe','',91,0,0,0.176666489966834,'20070527090143',2976610,1041,0), ...
EOS
pages = {}
sql_dump.each{|line|
next unless line =~ /\AINSERT INTO `page` VALUES (.*)\Z/
parse($1).each{|id,ns,title,*stuff|
next unless ns == "0"
title = sql_str_unquote(title)
pages[id] = title
}
}
p pages
=begin
sql_dump.each{|line|
next unless line =~ /\AINSERT INTO `externallinks` VALUES (.*)\Z/
parse($1).each{|from,to,index|
title = pages[from]
next unless title
to = sql_str_unquote(to)
next unless to =~ /\Ahttp:\/\//
puts "#{title}\t#{to}"
}
}
=end
#-----------------
require 'pp'
lisp_code = '(a (b c) (d (e) f g) (((h))))'
nodes = []
lisp_code.gsub!(/([a-z]+)/) {
nodes << [:atom, $1]
"<#{nodes.size-1}>"
}
#p nodes
lisp_code.gsub!(/\s/,"")
#puts lisp_code
true while lisp_code.gsub!(/\(((?:<\d+>)*)\)/) {
#p nodes
nodes << [:app, *$1.scan(/<(\d+)>/).map{|x,| nodes[x.to_i]}]
"<#{nodes.size-1}>"
}
lisp_code =~ /<(\d+)>/
#puts
#p nodes
#puts
pp nodes[$1.to_i]
# Output:
# [:app,
# [:atom, "a"],
# [:app, [:atom, "b"], [:atom, "c"]],
# [:app, [:atom, "d"], [:app, [:atom, "e"]], [:atom, "f"], [:atom, "g"]],
# [:app, [:app, [:app, [:atom, "h"]]]]]
#------------------
math_code = '(2 + 2 * 2) / ((2 + 2) * 2)'
nodes = []
math_code.gsub!(/(\d+)/) {
nodes << $1.to_i
"<#{nodes.size-1}>"
}
math_code.gsub!(/\s/,"")
until math_code =~ /\A<(\d+)>\Z/
next if math_code.gsub!(/\((<\d+>)\)/) { $1 }
next if math_code.gsub!(/<(\d+)>([\*\/])<(\d+)>/) {
nodes << [$2, nodes[$1.to_i], nodes[$3.to_i]]
"<#{nodes.size-1}>"
}
next if math_code.gsub!(/<(\d+)>([\+\-])<(\d+)>/) {
nodes << [$2, nodes[$1.to_i], nodes[$3.to_i]]
"<#{nodes.size-1}>"
}
end
pp nodes[$1.to_i]
# Output:
# ["/", ["+", 2, ["*", 2, 2]], ["*", ["+", 2, 2], 2]]




