DZone Snippets is a public source code repository. Easily build up your personal collection of code snippets, categorize them with tags / keywords, and share them with the world.
I Can't Be Bothered Going Downstairs To Watch The Daily Show
scraper.rb - scrapes metadata from thedailyshow.com.
Just run it - it creates videos/index.yml.
#!/usr/bin/ruby
require 'rubygems'
require 'leecher'
require 'open-uri'
require 'hpricot'
# Scrapes video metadata from the thedailyshow.com search-results page and
# registers every hit as a Video on the given site object.
class Scraper
  SEARCH_PAGE="http://www.thedailyshow.com/tds_files/includes/search/search_results.jhtml"

  # Named character entities that #unescape knows how to translate.
  NAMED_ENTITIES = {
    'amp'  => '&',
    'gt'   => '>',
    'lt'   => '<',
    'quot' => '"',
    'apos' => "'"
  }.freeze

  # Fetches the search results for one air date and creates a Video for each
  # result (Video.new adds itself to +site+).
  #
  # site  - object answering #skip (collection of dates already handled).
  # date  - the day to search for; must answer #month, #day and #year.
  # force - when true, scrape even if +date+ is listed in site.skip.
  #
  # Returns the number of results found, or nil when the day was skipped.
  # Raises RuntimeError when a result link carries no videoId parameter.
  def scrape_day(site, date, force=false)
    return if site.skip.include?(date) && !force
    url = sprintf("%s?searchterm=%02d-%02d-%04d", SEARCH_PAGE, date.month, date.day, date.year)
    puts "Fetching #{url}" if $DEBUG
    h = Hpricot(fetch(url))
    # can't use a real xpath - they all use an _id_ instead of a class
    results = (h/"div").find_all {|x| x['id'] == "videoListItem_1" }
    results.each {|result|
      # Distinct local names: the block must not overwrite the method's
      # own +url+ and +date+ variables.
      link = result.at("a[1]")['href']
      video_id = if link =~ /videoId=(\d+)/
        $1.to_i
      else
        raise "Failed to parse link #{link}"
      end
      title = ((result/"a")[1]/"text()").to_s
      aired = Date.parse((result/"a[@onclick][1]/text()").to_s)
      descr = result/"div.video_description"
      description = (descr/"div[1]/text()").to_s
      tags = (descr/".tags/a/text()").map {|x| unescape(x.to_s) }
      video = Video.new(site, video_id, aired, unescape(title), unescape(description), tags)
      puts video
    }
    results.length
  end

  # Replaces character entities in +t+ with the characters they stand for.
  #
  # Decimal (&#8217;) and hexadecimal (&#x2019;) references become the
  # UTF-8 encoding of that code point; the names in NAMED_ENTITIES are
  # translated; any other short entity is reported on $stderr and replaced
  # by '?'.
  #
  # Returns a new String; +t+ itself is not modified.
  def unescape(t)
    t.gsub(/&(#\d{1,7}|#[xX][0-9a-fA-F]{1,6}|[^;]{1,5});/) {
      # Keep the entity name in a local: the regexp tests below reset $1.
      entity = $1
      case entity
      when /\A#(\d+)\z/
        codepoint_to_s($1.to_i)
      when /\A#[xX]([0-9a-fA-F]+)\z/
        codepoint_to_s($1.hex)
      else
        NAMED_ENTITIES.fetch(entity) {
          $stderr.puts "Unknown entity #{entity.inspect}"
          '?'
        }
      end
    }
  end

  private

  # Reads the body of +url+. Uses URI.open where open-uri provides it and
  # falls back to Kernel#open otherwise.
  def fetch(url)
    if URI.respond_to?(:open)
      URI.open(url) {|f| f.read }
    else
      open(url) {|f| f.read }
    end
  end

  # Returns the UTF-8 string for a code point, or '?' when the number lies
  # outside the Unicode range or inside the surrogate block.
  def codepoint_to_s(code)
    return '?' if code > 0x10FFFF || (code >= 0xD800 && code <= 0xDFFF)
    [code].pack('U')
  end
end
# Script entry point: walk backwards one day at a time from today to the
# first day of 1999, scraping each day for the "tds" site and saving the
# index after every tenth day that produced results, and once at the end.
if __FILE__ == $0
  Site.load
  tds = Site.sites.find {|s| s.short_name == "tds" } || Site.new("The Daily Show","tds")
  scraper = Scraper.new
  productive_days = 0
  Date.today.downto(Date.new(1999)) do |day|
    found = scraper.scrape_day(tds, day)
    # nil means the day was skipped, so there is nothing to report
    puts "#{found} results for #{day}" if found
    # Mark days as done once we've scraped them a month after air
    tds.skip!(day) if (Date.today - day) > 30 && !found.nil?
    next if found.nil? || found.zero?
    productive_days += 1
    Site.save if productive_days % 10 == 0
  end
  Site.save
end
leecher.rb - downloads/searches/plays videos. Usage: ./leecher.rb [download/list/play] [searchterm ...]. Search terms can be a date (2007, 2007-10 or 2007-10-01), a tag (e.g. interview), an id (e.g. 31723), or the download state (downloaded or !downloaded).
#!/usr/bin/ruby
MEDIA_PLAYER = %w{mplayer -fs}
require 'rubygems'
require 'open-uri'
require 'rexml/document'
require 'rexml/xpath'
require 'fileutils'
require 'yaml'
require 'rio'
require 'set'
# A video site: a name, a short name used as its directory name, the videos
# known for it (keyed by id) and the set of dates that need no re-scraping.
#
# The class itself holds the shared state: the base directory, alternate
# directories and the list of all sites, persisted as <base>/index.yml.
class Site
  # Gives instances map/select/find_all/... over their videos via #each.
  include Enumerable

  class << self
    attr_reader :base
    attr_reader :alternates
    attr_reader :sites

    # Sets the default base directory and empty alternates/sites lists,
    # unless a base or a sites list is already present.
    def init
      return if base || sites
      @base = "./videos"
      @alternates = []
      @sites = []
    end

    # Loads the index.
    #
    # stream - a YAML source (IO or String). When nil, <base>/index.yml is
    #          read instead; if that file does not exist a warning is
    #          written to $stderr and the defaults from #init are kept.
    def load(stream=nil)
      if stream
        # An empty document parses to nil/false; treat it as no settings.
        stuff = parse_index(stream) || {}
        @base = stuff['base'] || "./videos"
        @sites = stuff['sites'] || []
        @alternates = stuff['alternates'] || []
      else
        init
        begin
          File.open(File.join(base,'index.yml')) {|f|
            load(f)
          }
        rescue Errno::ENOENT
          $stderr.puts "Warning, no database found, starting a new one"
        end
      end
    end

    # Saves the index.
    #
    # stream - an IO to dump the YAML into. When nil, the index is written
    #          to <base>/index.yml_ and then moved over <base>/index.yml, so
    #          a failed write does not leave a truncated index behind.
    def save(stream=nil)
      if stream
        YAML::dump({'base' => base, 'sites' => sites, 'alternates' => alternates},stream)
      else
        target = File.join(base,'index.yml')
        scratch = File.join(base,'index.yml_')
        # The base directory does not exist yet on a first run.
        FileUtils.mkpath(base)
        File.open(scratch,'w') {|f|
          save(f)
        }
        FileUtils.mv(scratch,target)
      end
    end

    # Yields every known site.
    def each(&block)
      sites.each(&block)
    end

    private

    # Parses the index YAML. The index contains serialized Site objects,
    # which newer YAML libraries only rebuild through unsafe_load.
    # NOTE: this instantiates arbitrary objects named in the document, so
    # only load index files this tool wrote itself.
    def parse_index(stream)
      loader = YAML.respond_to?(:unsafe_load) ? :unsafe_load : :load
      YAML.send(loader, stream)
    end
  end

  attr_reader :videos
  attr_reader :name
  attr_reader :short_name
  attr_reader :skip

  # Creates a site and appends it to Site.sites.
  #
  # name       - display name, returned by #to_s.
  # short_name - directory name under the base directory (defaults to name).
  def initialize(name, short_name=name)
    Site.init
    @videos = {}
    @name, @short_name = name, short_name
    @skip = Set.new
    Site.sites << self
  end

  # Directory under the base directory where this site's files are stored.
  def directory
    File.join(Site.base, short_name)
  end

  # This site's directory under each of the alternate base directories.
  def directory_alternates
    Site.alternates.map {|d| File.join(d, short_name) }
  end

  # Creates #directory (and missing parents).
  def ensure_dir_exists!
    FileUtils.mkpath(directory)
  end

  # Registers +vid+ under its id, replacing any earlier entry with that id.
  def <<(vid)
    videos[vid.id] = vid
  end

  # Adds +date+ to the set of dates to skip.
  def skip!(date)
    skip << date
  end

  # Returns the video with the given id, or nil.
  def [](id)
    videos[id]
  end

  def to_s
    name
  end

  # Yields every video of this site; without a block returns an Enumerator.
  def each
    return enum_for(:each) unless block_given?
    videos.each {|k,v| yield v }
  end
end
# One video of a site: its id plus optional date, title, description and
# tags. The video file is stored as "<id>.flv" in one of the site's
# directories.
class Video
  SHARED_DATA = "http://www.comedycentral.com/sitewide/video_player/shared/data"

  attr_reader :tags
  attr_reader :site
  attr_reader :id
  attr_reader :date
  attr_reader :title
  attr_reader :description

  # Creates the video and registers it with +site+ (via site << self).
  def initialize(site, id, date=nil, title = nil, description=nil, tags=[])
    @site = site
    @id = id
    @title = title
    @tags = tags
    @date = date
    @description = description
    site << self
  end

  # Path of the video file: the first copy that already exists in one of
  # the site's alternate directories, otherwise the path inside the site's
  # main directory (whether or not that file exists).
  def filename
    candidates = site.directory_alternates.map {|dir| File.join(dir,"#{id}.flv") }
    candidates.find {|path| File.exist?(path) } || File.join(site.directory, "#{id}.flv")
  end

  # True when the file named by #filename exists.
  def downloaded?
    File.exist?(filename)
  end

  # Downloads the video unless it is already present.
  def download
    download! unless downloaded?
  end

  # Downloads the video unconditionally.
  #
  # Returns the size of the downloaded file in bytes. If anything goes
  # wrong the partial file is removed and the original error is re-raised.
  def download!
    site.ensure_dir_exists!
    url = download_url
    target = filename
    begin
      rio(url) > rio(target)
      File.size(target)
    rescue Exception
      # Exception (not just StandardError) is rescued on purpose so that an
      # interrupt also cleans up the partial file; it is always re-raised.
      # rm_f ignores a missing file, so cleanup cannot mask the real error.
      FileUtils.rm_f(target)
      raise
    end
  end

  def to_s
    sprintf("[%1s %7d - %s - %s - %20s]",(downloaded?? 'D' : ' '), id, date, site, title)
  end

  # Fetches the XML manifest for this video id and returns the text of its
  # /package/video/item/src node (an empty string when there is none).
  def download_url
    manifest_url = "#{SHARED_DATA}/flv_xml_gen.jhtml?ml_video=#{id}&hiLoPref=hi"
    # Use URI.open where open-uri provides it; fall back to Kernel#open.
    manifest = if URI.respond_to?(:open)
      URI.open(manifest_url) {|f| f.read }
    else
      open(manifest_url) {|f| f.read }
    end
    doc = REXML::Document.new(manifest)
    REXML::XPath.first(doc, "/package/video/item/src/text()").to_s
  end
end
# A chainable predicate over videos. Each by_* call returns a new Filter
# whose test must pass in addition to every test of its parents, so
# Filter.by_tag("x").by_date(2007) matches videos satisfying both.
class Filter
  class << self
    # Lets the by* instance methods be called on the class itself, e.g.
    # Filter.by_tag("interview"), by forwarding them to a fresh Filter.
    def method_missing(sym,*args,&block)
      if by_method?(sym)
        new.send(sym,*args,&block)
      else
        super
      end
    end

    # Keeps respond_to? in agreement with method_missing.
    def respond_to_missing?(sym, include_private=false)
      by_method?(sym) || super
    end

    private

    # True when +sym+ names one of the by* instance methods.
    def by_method?(sym)
      sym.to_s.start_with?('by') && method_defined?(sym)
    end
  end

  # parent - filter whose test must also pass (nil for none).
  # block  - test taking a video; nil means "accept everything".
  def initialize(parent=nil,&block)
    @parent = parent
    @test = block
  end

  # Applies the filter.
  #
  # Given a Video, returns the video when it passes this filter and all of
  # its parents, otherwise nil.
  # Given a Site, yields each passing video when a block is given, or
  # returns an Array of the passing videos when no block is given.
  # Raises ArgumentError for anything else.
  def [](video)
    case video
    when Video
      video if (@test.nil? || @test[video]) && (@parent.nil? || @parent[video])
    when Site
      if block_given?
        video.each {|v| yield v if self[v] }
      else
        # Collect through #each only, so the site needs no other methods.
        matches = []
        video.each {|v| matches << v if self[v] }
        matches
      end
    else
      raise ArgumentError, "expected a Video or a Site, got #{video.class}"
    end
  end

  # Yields every passing video of every site known to Site.
  def each(&block)
    Site.each {|show| self.send(:[], show, &block) }
  end

  # Returns a child filter that additionally requires +block+ to pass.
  def filter(&block)
    Filter.new(self,&block)
  end

  # Matches videos whose downloaded? state equals the truthiness of +dl+.
  def by_downloaded(dl=true)
    wanted = dl ? true : false
    filter {|v| v.downloaded? == wanted }
  end

  # Matches videos dated in year +y+, optionally narrowed to month +m+ and
  # day +d+. A day given without a month is ignored. Videos without a date
  # never match.
  def by_date(y, m=nil, d=nil)
    if m.nil?
      filter {|v| v.date && v.date.year == y }
    elsif d.nil?
      filter {|v| v.date && v.date.year == y && v.date.month == m }
    else
      filter {|v| v.date && v.date.year == y && v.date.month == m && v.date.day == d }
    end
  end

  # Matches the video with exactly this id.
  def by_id(vid)
    filter {|v| v.id == vid }
  end

  # Matches videos carrying exactly this tag.
  def by_tag(tag)
    filter {|v| v.tags.include? tag }
  end

  # Matches videos whose title or description contains +text+ (ignoring
  # case) or that carry +text+ as an exact tag.
  def by_text(text)
    needle = text.downcase
    filter {|v|
      (v.title && v.title.downcase.include?(needle)) ||
        (v.description && v.description.downcase.include?(needle)) ||
        v.tags.include?(text)
    }
  end

  # Picks the filter from the shape of a command-line search term:
  # "downloaded"/"!downloaded", a year, year-month or year-month-day, an id
  # of five or more digits, or otherwise free text.
  def by(arg)
    arg = arg.to_s
    case arg
    when 'downloaded'
      by_downloaded(true)
    when '!downloaded'
      by_downloaded(false)
    when /\A(\d{4})\z/
      by_date($1.to_i)
    when /\A(\d{4})-(\d{1,2})\z/
      by_date($1.to_i, $2.to_i)
    when /\A(\d{4})-(\d{1,2})-(\d{1,2})\z/
      by_date($1.to_i, $2.to_i, $3.to_i)
    when /\A\d{5,}\z/
      by_id(arg.to_i)
    else
      by_text(arg)
    end
  end
end
# Script entry point: the first argument selects the action (download, list
# or play); every remaining argument narrows the set of videos through
# Filter#by. The action is then run for each matching video.
if __FILE__ == $0
  command = ARGV.shift or raise "Usage: leecher <download/list/play> [filters...]"
  action = case command.downcase
  when 'download'
    proc {|v|
      puts "Fetching #{v}" unless v.downloaded?
      begin
        v.download
      rescue OpenURI::HTTPError, Errno::ENOENT => x
        # One clause for both errors so the message always shows the
        # exception that was actually raised; the run continues.
        $stderr.puts "Download failed, skipping: #{x}"
      end
    }
  when 'list'
    proc {|v|
      puts "#{v} #{v.tags.join(', ')}"
    }
  when 'play'
    proc {|v|
      puts v
      # Argument-list form of system: the filename is never shell-parsed.
      system(*MEDIA_PLAYER, v.filename)
    }
  else
    raise "Unknown command #{command}"
  end
  # Each argument wraps the previous filter, so all of them must match.
  filter = ARGV.inject(Filter.new) {|f, arg| f.by(arg)}
  Site.load
  filter.each(&action)
end





