
I Can't Be Bothered Going Downstairs To Watch The Daily Show

11.18.2007
        scraper.rb - scrapes metadata from thedailyshow.com.
Just run it - creates videos/index.yml
#!/usr/bin/ruby

require 'rubygems'
require 'date'
require 'leecher'
require 'open-uri'
require 'hpricot'

class Scraper
	SEARCH_PAGE="http://www.thedailyshow.com/tds_files/includes/search/search_results.jhtml"
	def scrape_day(site, date, force=false)
		return if site.skip.include?(date) and not force
		url = sprintf("#{SEARCH_PAGE}?searchterm=%02d-%02d-%04d",date.month,date.day,date.year)
		puts "Fetching #{url}" if $DEBUG

		data = open(url) {|f| f.read }
		h = Hpricot(data)

		# can't use a real xpath - they all use an _id_ instead of a class
		results = (h/"div").find_all {|x| x['id'] == "videoListItem_1" }
		results.each {|result|
			url = result.at("a[1]")['href']
			vid = if url =~ /videoId=(\d+)/
				$1.to_i
			else
				raise "Failed to parse link #{url}"
			end

			title = ((result/"a")[1]/"text()").to_s
			date = Date.parse((result/"a[@onclick][1]/text()").to_s)

			descr = result/"div.video_description"
			description = (descr/"div[1]/text()").to_s
			tags = (descr/".tags/a/text()").map {|x| unescape(x.to_s) }
			
			vid = Video.new(site, vid, date, unescape(title), unescape(description), tags)
			puts vid
		}
	
		results.length
	end

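	# Minimal HTML entity decoder for the handful of entities that turn up in the scraped text.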
	def unescape(t)
		t.gsub(/&([^;]{1,5});/) {|ent|
			case $1
				when /^#(\d+)$/
					[$1.to_i].pack('C')
				when /^#x([0-9a-fA-F]+)$/i
					[$1.hex].pack('C')
				when 'amp'
					'&'
				when 'gt'
					'>'
				when 'quot'
					'"'
				when 'apos'
					"'"
				when 'lt'
					'<'
				else
					$stderr.puts "Unknown entity #{$1.inspect}"
					'?'
			end
		}
	end
end

if __FILE__ == $0
	Site.load
	tds = Site.sites.find {|s| s.short_name == "tds" }
	tds ||= Site.new("The Daily Show","tds")

	day = Date.today
	start = Date.new(1999)

	counter=0

	s = Scraper.new
	while day >= start
		results = s.scrape_day(tds, day)
		puts "#{results} results for #{day}" if results # else skipped

		# Mark days as done (skipped on future runs) once they're over a month past air and we've scraped them
		if (Date.today - day) > 30 and not results.nil?
			tds.skip!(day)
		end

		day -= 1
		unless results.nil? or results.zero?
			Site.save if (counter += 1)%10 == 0
		end
	end
	Site.save
end
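
The index.yml the scraper writes is nothing exotic - it's just the Site/Video
objects serialized with YAML by Site.save (see leecher.rb below). A minimal
sketch of reading it back through the same classes, assuming scraper.rb has
already been run once:

#!/usr/bin/ruby

require 'rubygems'
require 'leecher'

Site.load                            # reads videos/index.yml (warns if missing)
Site.each {|site|                    # every show in the index
	site.each {|video| puts video }  # Video#to_s: downloaded flag, id, date, show, title
}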

leecher.rb - downloads/searches/plays videos
./leecher.rb [download/list/play] [searchterm ... ]
Search terms can be:
  date: 2007 or 2007-10 or 2007-10-01
  tag: interview
  id: 31723
  already downloaded?: downloaded or !downloaded
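For example (the id and tag are the ones from the list above; filters given
together are ANDed):
  ./leecher.rb list 2007-10
  ./leecher.rb download !downloaded interview
  ./leecher.rb play 31723
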
#!/usr/bin/ruby

MEDIA_PLAYER = %w{mplayer -fs}

require 'rubygems'
require 'open-uri'
require 'rexml/document'
require 'rexml/xpath'
require 'fileutils'
require 'yaml'
require 'rio'
require 'set'

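# Site: one show's collection of Videos plus its on-disk directory layout. The
# full set of Sites is persisted to videos/index.yml as YAML via Site.save/Site.load.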
class Site
	include Enumerable  # instance #each yields Videos, so helpers like find_all work (Filter relies on this)
	class << self
		attr_reader :base
		attr_reader :alternates
		attr_reader :sites
		
		def init
			unless self.base or self.sites
				@base = "./videos"
				@alternates = []
				@sites = []
			end
		end

		def load(stream=nil)
			if stream
				stuff = YAML::load(stream)
				@base = stuff['base'] || "./videos"
				@sites = stuff['sites'] || []
				@alternates = stuff['alternates'] || []
			else
				init
				begin
					File.open(File.join(base,'index.yml')) {|f|
						load(f)
					}
				rescue Errno::ENOENT
					$stderr.puts "Warning, no database found, starting a new one"
				end
			end
		end
		
		def save(stream=nil)
			if stream
				YAML::dump({'base' => base, 'sites' => sites, 'alternates' => alternates},stream)
			else
				File.open(File.join(base,'index.yml_'),'w') {|f|
					save(f)
				}
				FileUtils.mv(File.join(base,'index.yml_'),File.join(base,'index.yml'))
			end
		end

		def each(&block)
			sites.each(&block)
		end
	end

	def initialize(name, short_name=name)
		Site.init
		@videos = {}
		@name, @short_name = name, short_name
		@skip = Set.new
		Site.sites << self
	end

	attr_reader :videos
	attr_reader :name
	attr_reader :short_name
	attr_reader :skip

	def directory
		File.join(Site.base, short_name)
	end
	def directory_alternates
		Site.alternates.map {|d| File.join(d, short_name) }
	end

	def ensure_dir_exists!
		FileUtils.mkpath(directory)
	end

	def <<(vid)
		self.videos[vid.id] = vid
	end

	def skip!(date)
		self.skip << date
	end

	def [](id)
		self.videos[id]
	end

	def to_s
		name
	end

	def each 
		self.videos.each {|k,v| yield v }	
	end
end

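# Video: metadata for a single clip. Knows where its .flv should live on disk
# and how to resolve the real download URL from Comedy Central's player data feed.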
class Video
	attr_reader :tags
	attr_reader :site
	attr_reader :id
	attr_reader :date
	attr_reader :title
	attr_reader :description

	def initialize(site, id, date=nil, title = nil, description=nil, tags=[]) 
 		@site = site
		@id = id
		@title = title
		@tags = tags
		@date = date
		@description = description

		site << self
	end

	def filename
		site.directory_alternates.map{|x| 
			File.join(x,"#{id}.flv")
		}.find {|f| 
			File.exists?(f) 
		} || File.join(site.directory, "#{id}.flv")
	end

	def downloaded?
		File.exists?(filename)
	end

	def download
		download! unless downloaded?
	end

	SHARED_DATA = "http://www.comedycentral.com/sitewide/video_player/shared/data"
	def download!
		site.ensure_dir_exists!
		url = download_url()
		begin
			rio(url) > rio(filename)
			File.size(filename)
		rescue Exception => x
			begin
				File.delete(filename)
			rescue Exception
			end
			raise x
		end
	end

	def to_s 
		sprintf("[%1s %7d - %s - %s - %20s]",(downloaded?? 'D' : ' '), id, date, site, title) 
	end

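	# Ask the per-video XML manifest for the actual .flv URL (the first <src> element).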
	def download_url
		manifest = open("#{SHARED_DATA}/flv_xml_gen.jhtml?ml_video=#{id}&hiLoPref=hi") {|f| f.read }
		doc = REXML::Document.new(manifest)
		REXML::XPath.first(doc, "/package/video/item/src/text()").to_s
	end
end

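# Filter: a chainable predicate over Videos. Each by_* call wraps the previous
# filter, so filters given together are ANDed; #each walks every Site and yields
# only the Videos that pass.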
class Filter
	class << self
		def method_missing(sym,*args,&block)
			if sym.to_s =~ /^by/
				new.send(sym,*args,&block)
			else
				super
			end
		end
	end

	def initialize(parent=nil,&block)
		@parent = parent
		@test = block
	end
	
	def [](video)
		case video
			when Video
				video if (@test.nil? or @test[video]) and (@parent.nil? or @parent[video])
			when Site
				if block_given?
					video.each {|v| yield v if self[v] }
				else
					video.find_all {|v| self[v] }
				end
			else
				raise ArgumentError
		end
	end

	def each(&block)
		Site.each {|show| self.send(:[], show, &block) }
	end

	def filter(&block)
		Filter.new(self,&block)
	end

	def by_downloaded(dl=true)
		filter {|v| v.downloaded? == !!dl }
	end

	def by_date(y, m=nil, d=nil)
		if m.nil? and d.nil?
			filter {|v| v.date and v.date.year == y }
		elsif d.nil?
			filter {|v| v.date and v.date.year == y and v.date.month == m }
		else
			filter {|v| v.date and v.date.year == y and v.date.month == m and v.date.day == d }
		end
	end

	def by_id(vid)
		filter {|v| v.id == vid }
	end

	def by_tag(tag)
		filter {|v| v.tags.include? tag }
	end

	def by_text(text) 
		filter {|v| 
			v.title && v.title.downcase.include?(text.downcase) or 
			v.description && v.description.downcase.include?(text.downcase) or
			v.tags.include? text
		}
	end

	def by(arg)
		arg = arg.to_s
		case arg
			when 'downloaded'
				by_downloaded(true)
			when '!downloaded'
				by_downloaded(false)
			when /^(\d{4})$/
				by_date($1.to_i)
			when /^(\d{4})-(\d{1,2})$/
				by_date($1.to_i, $2.to_i)
			when /^(\d{4})-(\d{1,2})-(\d{1,2})$/
				by_date($1.to_i, $2.to_i, $3.to_i)
			when /^\d{5,}$/
				by_id(arg.to_i)
			else
				by_text(arg)
		end
	end
end

if __FILE__ == $0
	command = ARGV.shift or raise "Usage: leecher <download/list/play> [filters...]"
	action = case command.downcase
		when 'download'
			proc {|v| 
				puts "Fetching #{v}" unless v.downloaded?
				begin
					v.download
				rescue OpenURI::HTTPError => x
					$stderr.puts "Download failed, skipping: #{x}"
				rescue Errno::ENOENT => x
					$stderr.puts "Download failed, skipping: #{x}"
				end
			}
		when 'list'
			proc {|v|
				puts "#{v} #{v.tags.join(', ')}"
			}
		when 'play'
			proc {|v|
				puts v
				system(*(MEDIA_PLAYER + [v.filename]))
			}
		else
			raise "Unknown command #{command}"
	end

	filter = ARGV.inject(Filter.new) {|f, arg| f.by(arg)}

	Site.load
	filter.each(&action)
end