#!/usr/bin/env ruby
# This little piece of software is free in every sense of the word.
# Mon, 19 Apr 2004, Jan Weil
#
# For every URL given on the command line: fetch it with wget, render the
# downloaded file with `lynx -dump`, and print one tab-separated row
# (text, urls, category) for every list item that carries at least one link.
#
# State shared between the helpers lives in globals that are reset per URL:
#   $reference - link legend taken from the "References" section of the dump,
#                indexed by link number
#   $cat       - most recent heading/list-item text seen, indexed by level
#   $data      - collected rows (hashes with "text", "urls" and "cat" keys)
#   $lines     - the rendered page as a stack of lines (next line on top)

# NOTE(review): the literal spaces inside the patterns below decide which
# indentation depth of the lynx output each one matches. They are kept
# exactly as found in this file; confirm the counts against real
# `lynx -dump` output, since the title pattern and all continuation
# patterns are currently identical.

# A page title; its text becomes the level-1 category.
TITLE_PATTERN = /^ (\S.*)$/

# One entry per list nesting level:
# [level, pattern opening an item, pattern matching its continuation lines]
LIST_LEVELS = [
  [1, / \* (\S.*)$/, /^ (\S.*)$/],
  [2, / \+ (\S.*)$/, /^ (\S.*)$/],
  [3, /^ o (\S.*)$/, /^ (\S.*)$/],
  [4, /^ # (\S.*)$/, /^ (\S.*)$/]
  # there is no higher level, right?
].freeze

# A "[n]" link marker followed by the rest of the line.
LINK_MARKER = /\[(\d+)\](\S.+)/

if ARGV.empty? || ARGV.include?("-h") || ARGV.include?("--help")
  puts "usage: #{File.basename($0)} [-H] URL..."
  puts "-H --header\tadd csv header"
  exit
end

if ARGV.include?("-H") || ARGV.include?("--header")
  $print_header = true
  ARGV.delete("-H")
  ARGV.delete("--header")
end

# Strips every "[n]" link marker from +str+ IN PLACE and resolves each n
# through $reference.
#
# @param str [String] list-item text; mutated so the markers disappear
# @return [String, false] the resolved URLs joined by single spaces, or
#   false when +str+ contained no marker
def extract_urls(str)
  urls = []
  while (marker = LINK_MARKER.match(str))
    urls.push($reference[marker[1].to_i])
    # Replace "[n]rest" by "rest"; this is what makes the loop terminate.
    str.sub!(LINK_MARKER) { marker[2] }
  end
  urls.empty? ? false : urls.join(" ")
end

# Completes a list item by appending its continuation lines (popped from
# $lines while they match +regex+), records it in $data if it has links,
# and remembers its text as the category for the next deeper level.
#
# @param line [String] first line of the item, bullet already removed
# @param level [Integer] nesting level of the item (1 = outermost)
# @param regex [Regexp] pattern whose first group captures the text of a
#   continuation line
def push_li(line, level, regex)
  next_line = nil
  loop do
    next_line = $lines.pop
    break unless next_line =~ regex

    line = "#{line} #{Regexp.last_match(1)}"
  end
  # Put back the line that ended the item so the caller still sees it
  # (nothing to put back once the stack ran empty).
  $lines.push(next_line) if next_line

  urls = extract_urls(line)
  $data.push({"text" => line, "urls" => urls, "cat" => $cat[level] || "None"}) if urls
  $cat[level + 1] = line
end

ARGV.each do |url|
  $reference = []
  $cat = []
  $data = []

  # XXX this works, at least for linux-sound.org
  loc = url[/(\w+\.\w+)$/, 1] || raise("Help me at XXX!")

  # Argument-vector form: the URL never passes through a shell, so shell
  # metacharacters in it cannot be interpreted. stdout is discarded.
  unless system("wget", url, out: File::NULL)
    STDERR << "calling wget failed! Is it installed?\n"
    exit 1
  end

  # loc only contains word characters and one dot, so interpolating it
  # (and tmp) into the shell commands below is safe.
  tmp = loc + ".dump"
  begin
    # unset locales (we need ^References$)
    ENV["LANG"] = "C"
    `lynx -dump #{loc} > #{tmp}`
    unless $?.success?
      STDERR << "calling lynx failed! Is it installed?\n"
      exit 1
    end

    # extract link list (legend)
    legend = `sed -n '/^References$/,$p' #{tmp} | sed -n '3,$p'`.split(/$/)
    unless $?.success?
      STDERR << "calling sed failed! Is it installed?\n"
      exit 1
    end
    legend.each do |entry|
      number, target = entry.split
      $reference[number.to_i] = target
    end

    # extract data
    $lines = `sed -n '1,/^References$/p' #{tmp}`.split(/$/)
  ensure
    # Runs on every way out of the block, including `exit`.
    File.delete(tmp) if File.exist?(tmp)
  end

  # we need a stack
  $lines.reverse!

  # traverse all lines
  while (line = $lines.pop)
    # title
    if (title = line[TITLE_PATTERN, 1])
      $cat[1] = title
      next
    end

    # list items: the first level whose pattern matches wins
    LIST_LEVELS.each do |level, item_pattern, continuation_pattern|
      item = line[item_pattern, 1]
      next unless item

      push_li(item, level, continuation_pattern)
      break
    end
  end

  # Order by category first, then by text.
  $data.sort_by! { |row| [row["cat"], row["text"]] }

  print "Text\tUrls\tCategory\n" if $print_header
  $data.each do |row|
    print "#{row['text']}\t#{row['urls']}\t#{row['cat']}\n"
  end
end