# -*- coding: ascii -*-

###########################################################################
# clive, video extraction utility
# Copyright (C) 2007-2008 Toni Gundogdu
#
# clive is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
###########################################################################

## Classes for parsing video page HTML

import os
import sys
import urlparse
import urllib
import formatter
import time
import md5
import re
import string
import gzip

from htmllib import HTMLParser

try:
    from cStringIO import StringIO
except ImportError:
    import StringIO

from clive.path import ConfigDir
from clive.error import CliveError, CliveNoMediaError
try:
    from clive.urlgrabber.grabber import URLGrabber, URLGrabError
except ImportError, err:
    raise SystemExit('error: %s' % err)

__all__ = ['PageParser']


## The class for host specific video page HTML parsing
class PageParser:

    ## Constructor
    def __init__(self):
        self._supported_hosts = [
            ('youtube.com', 'ytube', self._parse_ytube),
            ('video.google.', 'vgoogle', self._parse_vgoogle),
            ('dailymotion.', 'dmotion', self._parse_dmotion),
            ('guba.com', 'guba', self._parse_guba),
            ('metacafe.', 'metac', self._parse_metacafe),
            ('sevenload.com', 'sevenl', self._parse_sevenload),
        ]

    ## Parses a video page data (HTML)
    def parse(self, url_data, batch, proxy):
        self._say = url_data['say_callb']
        self._opts = url_data['opts']
        self._proxy = proxy

        if len(url_data['title']) == 0:
            fmt = formatter.AbstractFormatter(formatter.NullWriter())
            p = HTMLParser(fmt)
            p.feed(url_data['data'])
            title = p.title
            p.close()
        else:
            title = url_data['title']

        xurl = '' # video eXtraction URL
        low_quality = url_data['low_quality']
        for (site, v_host, func) in self._supported_hosts:
            ##if url_data['url'].lower().find(site) != -1:
            if site in url_data['url'].lower():
                if len(url_data['xurl']) > 0:
                    xurl = url_data['xurl'] # From cache
                    v_id = url_data['v_id']
                else:
                    (xurl, v_id, low_quality) = \
                        func(url_data['url'], url_data['data'], low_quality)
                break

        if len(xurl) == 0:
            raise CliveError('error: extraction url not found')

        if url_data['file_length'] == -1:
            try:
                length = url_data['file_length_callb'](xurl)
            except CliveNoMediaError: # mp4 not avail., fallback to flv
                self._say('warn: no-media: switched to low quality')
                xurl = xurl.strip('&fmt=18')
                length = url_data['file_length_callb'](xurl)
                low_quality = 1 # !
        else:
            l = url_data['file_length']
            length = ('%.2fMB' % (float(l) / (1024*1024)), l)

        v_info = {
            'page_title':title,
            'url':url_data['url'],
            'xurl':xurl,
            'v_id':v_id,
            'v_host':v_host,
            'length':length[0],
            'length_bytes':length[1],
            'low_quality':low_quality,
        }
        self._get_filename(v_info, batch)
        return v_info

    def _parse_ytube(self, url, data, low_quality):
        try:
            vid = url.split('watch?v=',1)[1].split('&',1)[0]
        except IndexError:
            vid = self._random_vid()

        ldata = data.lower()

        a = [('verify you are 18','age verification: use --youtube-user'),
            ('no longer available','not available'),
            #('has been removed','video removed'),
            ('video is unavailable','video unavailable'),
            ('malformed video id','url contains malformed video id'),
            ('not available in your country','country restriction/censorship'),
            ('private video','private video')]

        for (lookup, errmsg) in a:
            ##if ldata.find(lookup) != -1:
            if lookup in ldata:
                raise CliveError('error: ' + errmsg)
        try:
            video_id = \
                self._parse_from_to(data, 'video_id=', '&', skip_from=1)
            video_id = video_id.replace("'", "")
            if len(video_id) == 0: raise CliveError()
        except:
            raise CliveError('error: extraction url (&video_id) not found')
		
        try:
            t = self._parse_from_to(data, '&t=', '&', skip_from=1)
            t = t.replace("'", "")
            if len(t) == 0: raise CliveError()
        except:
            raise CliveError('error: extraction url (&t) not found')

        url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (
            video_id, t)

        if not self._opts.enable_low_quality:
            url += '&fmt=18'
        return (url, vid, low_quality)

    def _parse_vgoogle(self, url, data, low_quality):
        try:
            vid = url.split('docid=',1)[1].split('&',1)[0]
        except IndexError:
            vid = self._random_vid()
        # flv
        url = self._parse_from_to(
            data.decode('unicode-escape').encode('latin-1'),
            'googleplayer.swf?videoUrl=', '"', skip_from=1)
        url = urllib.unquote(url)
        try:
            url = url.split('&thumb',1)[0]
        except IndexError:
            url = ''
        # mp4            
        if not self._opts.enable_low_quality:
            mp4 = self._parse_from_to(data,
                'right-click <a href="', '"', skip_from=1)
            if len(mp4) == 0:
                self._say('warn: no-media: switched to low quality')
                low_quality = 1 # !
            else:
                url = mp4
        return (url, vid, low_quality)
	
    def _parse_dmotion(self, url, data, low_quality):
        try:
            vid = url.rsplit('/',1)[1].split('_',1)[0]
        except IndexError:
            vid = self._random_vid()
        batch = self._parse_from_to(data, '"video", ', ');', skip_from=1)
        if len(batch) == 0:
            ldata = data.lower()
            a = [('content deleted','video removed'),
                ('users+have+flagged','age verification: use --dmotion-user')]
                #('explicit content','no-support; req. age verification')]
            for (lookup, errmsg) in a:
                ##if ldata.find(lookup) != -1:
                if lookup in ldata:
                    raise CliveError('error: ' + errmsg)
            raise CliveError('error: extraction url not found')
        batch = urllib.unquote(batch.strip('"')).split('||')
        d = {}
        for i in batch:
            s = i.split('/',4)
            d[s[3]] = i.split('@',1)[0]
        batch = sorted(d.items(), key=lambda(k,v):(v,k))
        url = 'http://dailymotion.com'
        if self._opts.enable_low_quality:
            url += dict(batch)['320x240']
        else:
            if batch[0][0] == '320x240':
                self._say('warn: no-media: switched to low quality')
                low_quality = 1 # !
            url += batch[0][1]
        return (url, vid, low_quality)
	
    def _parse_guba(self, url, data, low_quality):
        try:
            vid = url.split('watch/',1)[1].split('?',1)[0]
        except IndexError:
            vid = self._random_vid()
        url = self._parse_from_to(data,
            'http://free.guba.com/uploaditem/', '"')
        return (url, vid, low_quality)

    def _parse_metacafe(self, url, data, low_quality):
        ##if data.lower().find('adult confirmation') != -1:
        if 'adult confirmation' in data.lower():
            raise CliveError('error: no-support; req. age verification')
        try:
            vid = url.split('/watch/',1)[1].split('/')[0]
        except IndexError:
            vid = self._random_vid()
        url = self._parse_from_to(data, 'mediaURL=', '&', skip_from=1)
        return (url, vid, low_quality)

    def _parse_sevenload(self, url, data, low_quality):
        """Build the extraction URL for a sevenload video page.

        The video id is the part of `url` between '/videos/' and the
        first '-'. The page HTML passed in as `data` is not used: the
        player XML is fetched from
            http://flash.sevenload.com/player?itemId=[ID from URL]
        and only its `video url="..."` attribute is read; everything
        else in that document is discarded for now.

        Returns an (extraction url, video id, low_quality) tuple.

        Raises CliveError when the video id cannot be parsed from `url`.
        """
        try:
            vid = url.split('/videos/', 1)[1].split('-')[0]
        except IndexError:
            # A random id is no use here: the player URL fetched below
            # is built from the real video id.
            raise CliveError('error: no-videoid: url parsing failed')

        player_url = 'http://flash.sevenload.com/player?itemId=' + vid
        player_xml = self._fetch_page(player_url)
        xurl = self._parse_from_to(
            player_xml, 'video url="', '"', skip_from=1)
        return (xurl, vid, low_quality)

    def _fetch_page(self, url):
        """Download `url` and return the response body.

        The request uses the configured user agent and throttle, the
        proxies given to parse(), and asks for gzip encoding. A body
        whose content-encoding header is 'gzip' is decompressed before it
        is returned.
        """
        g = URLGrabber(user_agent = self._opts.http_agent,
            http_headers = (('accept-encoding', 'gzip'),),
            throttle = self._opts.http_throttle,
            proxies = self._proxy)
        o = g.urlopen(url)
        try:
            data = o.read()
            gzipped = o.hdr.get('content-encoding') == 'gzip'
        finally:
            # Close the response even when reading it fails.
            o.close()
        if gzipped:
            data = gzip.GzipFile(fileobj=StringIO(data)).read()
        return data

    def _random_vid(self):
        return md5.new(str(time.time())).hexdigest()[:8]

    def _parse_from_to(self, data, _from, to, skip_from=0):
        start = data.find(_from)
        if skip_from and start != -1:
            start = start + len(_from)
        end = data.find(to, start)
        text = ''
        if start != -1 and end != -1:
            text = data[start:end]
        return text

    def _get_filename(self, v_info, batch):
        urlg_reget = None
        skip = 0
        ext = 'flv'
        offset = 0

        if v_info['v_host'] in ['ytube','dmotion','vgoogle']:
            if not v_info['low_quality']:
                ext = 'mp4'
                ##if v_info['url'].find('dailymotion.') != -1:
                if 'dailymotion.' in v_info['url'].lower():
                    end = v_info['xurl'].find('?') # copy ext from page html
                    ext = v_info['xurl'][end-3:end]

        title = v_info['page_title'].replace('YouTube -', '')
        title = title.replace('GUBA -', '')
        title = title.replace(' Video - Metacafe', '')
        try:
            ##if v_info['url'].lower().find('dailymotion.') != -1:
            if 'dailymotion.' in v_info['url'].lower():
                title = title.lstrip('Video ').split('-')[0].rstrip()
        except IndexError:
            pass
        if v_info['v_host'] == 'sevenl':
            title = v_info['page_title'].replace('Video "','')
            title = title.replace('" | sevenload','')
            ext = 'flv'

        if self._opts.output_mask == 'custom':
            sys.path.append(ConfigDir().dir())
            try:
                from custom import custom_output_mask
                title = custom_output_mask(title)
            except ImportError, err:
                self._say('error:%s: %s' % (ConfigDir().customfile(),err[0]))
                self._say('warn: ignoring --mask=custom, using default mask')
                title = re.sub('[^A-Za-z0-9]', '', title)
        else:
            if self._opts.output_mask:
                title = re.sub('[^%s]' % self._opts.output_mask, '', title)

        title = title.lstrip().rstrip()
        if len(title) == 0:
            title = self._random_string(insert_dash=0)

        if len(title) > 64:
            title = title[:64]

        filename = self._opts.output_format.replace('%','$')
        d = {'t':title,'i':v_info['v_id'],'h':v_info['v_host'],'e':ext}
        filename = string.Template(filename).substitute(d)

        if self._opts.output_file:
            filename = self._opts.output_file

        if self._opts.output_savedir:
            filename = os.path.join(self._opts.output_savedir, filename)

        if os.path.exists(filename) and not self._opts.emit_csv:
            if os.path.getsize(filename) < v_info['length_bytes']:
                a = ['dmotion', 'guba', 'metac', 'ytube','vgoogle','sevenl']
                if ext == 'flv':
                    # Continue partial download fails for flv files
                    # (ytube, vgoogle). mp4 transfers are ok.
                    if v_info['v_host'] == 'ytube':
                        a.remove('ytube')
                    if v_info['v_host'] == 'vgoogle':
                        a.remove('vgoogle')
                if v_info['v_host'] in a:
                    urlg_reget = 'simple' # Continue partial download
                    offset = os.path.getsize(filename)
                else:
                    if self._opts.output_exists != 'overwrite':
                        self._say('warn: no-support; cannot continue ' +
                            'partially downloaded file')
                        # Continue getting a partially-downloaded file
                        filename = self._rename_file(filename)
            else:
                if self._opts.output_exists == 'overwrite':
                    os.remove(filename)
                else:    
                    if os.path.getsize(filename) == v_info['length_bytes']:
                        skip = 1
                        self._say('notice: %s (same length) exists already' \
                            % os.path.basename(filename))
                    else:                                
                        filename = self._rename_file(filename)
        
        # Make sure there are not any duplicates in the URL batch
        for vi in batch:
            if vi['output_file'] == filename:
                filename = self._rename_file(filename) # Found
                break

        v_info['output_file'] = filename
        v_info['urlg_reget'] = urlg_reget
        v_info['skip_extraction'] = skip
        v_info['offset'] = offset
	
    def _rename_file(self, filename):
        (root, ext) = os.path.splitext(filename)
        return root + self._random_string() + ext

    def _random_string(self, insert_dash=1):
        s = ''
        if insert_dash: s += '-'
        s += (md5.new(str(time.time())).hexdigest()[:8])
        return s
        """
        s += time.strftime('-%Y-%m-%dT%H:%M:%S').replace(':', '_')
        """


