#!/usr/bin/env python
# -*- coding: iso-8859-1 -*-
#******************************************************************************\
#* Copyright (C) 2003-2004 Martin Blais <blais@furius.ca>
#*
#* This program is free software; you can redistribute it and/or modify
#* it under the terms of the GNU General Public License as published by
#* the Free Software Foundation; either version 2 of the License, or
#* (at your option) any later version.
#*
#* This program is distributed in the hope that it will be useful,
#* but WITHOUT ANY WARRANTY; without even the implied warranty of
#* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#* GNU General Public License for more details.
#*
#* You should have received a copy of the GNU General Public License
#* along with this program; if not, write to the Free Software
#* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#*
#*****************************************************************************/

"""html-wrap [<options>] [<directory>]

Script that looks at all the HTML files in a directory, reads them and creates a
mirror site that wraps them up in a template.

If you're processing the output of a recursive docutils-buildhtml run, you might
want to ignore all the .txt files, by using the '-I .*.txt' option.  Likewise,
if you're processing files which are under CVS you can use '-I CVS' to avoid
seeing the CVS admin directories appear in the output.

If you want to see images linked in the original tree, make sure you specify an
appropriate option for treatment of the non-HTML files.

"""

# Module metadata.  __version__ is also passed to optparse.OptionParser in
# main() as the version string.
__version__ = "Revision: 1.13 "
__author__ = "Martin Blais <blais@furius.ca>"

# TO DO
# -----
#
# - fix entities problem: XML parsing doesn't have the HTML entities defined.
# - FIXME you need to find a nice way to specify a stylesheet

#===============================================================================
# EXTERNAL DECLARATIONS
#===============================================================================

import sys, os
from os.path import *
import re
import copy, shutil
import datetime

from elementtree import ElementTree

#===============================================================================
# LOCAL DECLARATIONS
#===============================================================================

#-------------------------------------------------------------------------------
#
def relpath(f, d):
    """Return the path of f expressed relative to the directory d.

    Both arguments are passed through realpath() first, so symbolic links are
    resolved and relative inputs are interpreted against the current working
    directory.  The result climbs out of d with '..' components as needed and
    is '.' when both arguments resolve to the same location.
    """

    # Empty components (produced by the leading separator, or by the root
    # directory itself) are dropped so that they never count as a directory
    # level to climb out of.
    target = [p for p in realpath(f).split(os.sep) if p]
    base = [p for p in realpath(d).split(os.sep) if p]

    # Count the leading components shared by both paths.
    common = 0
    limit = min(len(target), len(base))
    while common < limit and target[common] == base[common]:
        common += 1

    # Climb out of what remains of d, then descend into what remains of f.
    parts = ['..'] * (len(base) - common) + target[common:]
    if not parts:
        return '.'
    # join(*parts) replaces the deprecated apply() builtin.
    return normpath(join(*parts))

#-------------------------------------------------------------------------------
#
def doubfilter(filt, llist):
    """Partition llist according to the predicate filt.

    Returns a pair of lists (matches, rejects): the elements for which
    filt(element) is true, and the remaining ones.  The original order is
    preserved within each list.
    """

    matches, rejects = [], []
    for item in llist:
        # Pick the destination list first, then append once.
        if filt(item):
            target = matches
        else:
            target = rejects
        target.append(item)
    return matches, rejects

#-------------------------------------------------------------------------------
#
def getiteratorp(self, tag=None, index=0, parent=None):
    """Variant of Element.getiterator that also reports each node's parent.

    Returns a list of (node, index, parent) tuples for this element and all
    of its descendants, parents before their children, where index is the
    position of node among parent's children.  The starting element is
    reported with the index and parent given by the caller (0 and None by
    default).  If tag is given and is not "*", only nodes with that tag are
    reported.  Knowing the parent and position is what allows a caller to
    replace a node in place.
    """

    wanted = tag
    if wanted == "*":
        # "*" is a wildcard, same as not filtering at all.
        wanted = None

    found = []
    if wanted is None or self.tag == wanted:
        found.append((self, index, parent))

    # Recurse into the children, telling each one its position and parent.
    for position, child in enumerate(self._children):
        found.extend(child.getiteratorp(wanted, position, self))
    return found

# Install getiteratorp as a method on elementtree's element class, so that the
# recursive node.getiteratorp(...) calls made by the function resolve.
# NOTE(review): _Element is a private name of the elementtree package; this
# presumably only works with the standalone elementtree implementation --
# confirm before switching to another ElementTree implementation.
ElementTree._Element.getiteratorp = getiteratorp

#-------------------------------------------------------------------------------
#
def cleantree(otree, ns):
    """Strip the namespace prefix ns from every tag under otree, in place.

    otree is anything offering getiterator(); ns is the literal prefix to
    remove (for instance '{uri}').  Tags that do not start with ns are left
    untouched.  This is an elementtree-specific workaround applied before
    output so that the namespace does not show up in the written tags.
    """

    prefixlen = len(ns)
    for node in otree.getiterator():
        name = node.tag
        if name.startswith(ns):
            node.tag = name[prefixlen:]

#-------------------------------------------------------------------------------
#
def indenttree(node, indent='    ', level=0):
    """Append a newline to the tail text of node and of all its descendants.

    This makes the serialized output break lines after each element.  The
    indent and level arguments are only passed along through the recursion;
    no indentation text is actually inserted.
    """

    if node.tail:
        node.tail = node.tail + '\n'
    else:
        # Covers both a missing (None) and an empty tail.
        node.tail = '\n'

    for child in node.getchildren():
        indenttree(child, indent, level+1)



#-------------------------------------------------------------------------------
#
def readhtml(fn):
    """Parse the document fn and return the tuple (tree, root, title).

    fn is whatever ElementTree.parse() accepts.  title is the text of the
    first <title> element found, or '' when the document has none (it may be
    None when the element is present but has no text).  If the root tag
    carries a namespace prefix of the form '{...}', that prefix is stripped
    from all tags of the tree before returning.

    Exits the program (SystemExit) if the file cannot be read.
    """

    try:
        tree = ElementTree.parse(fn)
        root = tree.getroot()
    except IOError:
        err = sys.exc_info()[1]
        raise SystemExit("Error: couldn't read file (%s):" % fn + str(err))

    # A namespaced root tag looks like '{uri}html'; keep the '{uri}' part.
    nsmatch = re.match('({.*}).*', root.tag)
    if nsmatch:
        ns = nsmatch.group(1)
    else:
        ns = ''

    # Only the first title element is used.
    title = ''
    for titlenode in root.getiterator(ns + 'title'):
        title = titlenode.text
        break

    cleantree(root, ns)

    return tree, root, title



#-------------------------------------------------------------------------------
#
class Common:

    """Base class of the tree entries; provides a title-based representation."""

    def __repr__(self):
        """Return the entry's title, or a placeholder when it has none.

        __repr__ must return a string, but title can be None (a document
        whose <title> element has no text) or not be set yet, so those cases
        get a generic placeholder instead of raising.
        """
        title = getattr(self, 'title', None)
        if title is None:
            return '<%s (untitled)>' % self.__class__.__name__
        return title

#-------------------------------------------------------------------------------
#
class HtmlDir(Common):

    """Represents a directory and its enclosing information.

    If the directory contains an index document (named by the --index
    option), that document is parsed and supplies the title; otherwise the
    directory's base name is used as the title.
    """

    def __init__(self, dn):
        """Build the entry for directory dn, parsing its index file if any."""
        self.fn = normpath(dn)
        indexfn = join(dn, opts.index)
        if exists(indexfn):
            self.tree, self.root, self.title = readhtml(indexfn)
        else:
            # No index document.  root is set as well so that every instance
            # has the same attributes whichever branch was taken.
            self.tree, self.root, self.title = None, None, basename(dn)

        # HtmlFile entries for the HTML files found in this directory.
        self.files = []
        # HtmlDir entries for the subdirectories.
        self.dirs = []
        # Names of the other (non-HTML) files in this directory.
        self.other = []
        # Enclosing HtmlDir, or None for the top directory.
        self.parent = None

#-------------------------------------------------------------------------------
#
class HtmlFile(Common):

    """Entry for a single HTML document, parsed when the entry is created."""

    def __init__(self, fn):
        """Parse the document at fn and record its tree, root and title."""
        self.fn = normpath(fn)
        parsed = readhtml(fn)
        self.tree, self.root, self.title = parsed
        # Set by the caller to the HtmlDir that contains this file.
        self.parent = None

#-------------------------------------------------------------------------------
#
def procnav(otree, ff):
    """Fill in the navigation area of the template tree otree for entry ff.

    Builds a <div id="template-navigation"> holding one link per ancestor
    directory of ff (outermost first, each pointing at that directory's index
    file) followed by an unlinked entry for ff itself, then substitutes it for
    every <div id="template-navigation"> found in otree.
    """

    navbar = ElementTree.Element('div')
    navbar.set('id', 'template-navigation')

    # Links are expressed relative to the directory that holds the page.
    if isinstance(ff, HtmlFile):
        cwd = dirname(ff.fn)
    else:
        cwd = ff.fn

    # Collect ff and its chain of parents, outermost first.
    chain = []
    entry = ff
    while entry:
        chain.insert(0, entry)
        entry = entry.parent

    # Every element but the last one (ff itself) becomes a link.
    for ancestor in chain[:-1]:
        target = relpath(join(ancestor.fn, opts.index), cwd)
        link = ElementTree.SubElement(navbar, 'a', href=target)
        link.text = ancestor.title or basename(ancestor.fn)
        link.tail = '    '

    current = ElementTree.SubElement(navbar, 'a')
    current.set('id', 'nav-entry')
    current.text = ff.title or basename(ff.fn)

    # The top directory ends up titled '.' when it has no index title of its
    # own; show the configured root name instead.
    first = navbar.getiterator('a')[0]
    if first.text == '.':
        first.text = opts.root_name

    # Swap the freshly built div in for the template's placeholder(s).
    for node, index, parent in otree.getroot().getiteratorp('div'):
        if node.get('id') == 'template-navigation':
            parent[index] = navbar

#-------------------------------------------------------------------------------
#
def getbody(tree, name=''):
    """Find the body element of tree and turn it into the contents div.

    The first <body> element found is modified in place: its tag becomes
    'div' and its id attribute is set to 'template-contents'.  That same
    element is returned.

    name is only used to identify the tree in the error message.  Exits the
    program (SystemExit) when the tree contains no body element.
    """

    bodies = tree.getiterator('body')
    if not bodies:
        raise SystemExit("Error: no body in given tree: %s" % name)

    # Note: we just take the first body we find.
    body = bodies[0]
    body.tag = 'div'
    body.set('id', 'template-contents')

    return body

#-------------------------------------------------------------------------------
#
def addsidebar(body, sidebar):
    """Return a one-row table with the sidebar on the left and body on the right.

    sidebar is the parsed sidebar document; it is deep-copied before its body
    is extracted, so the original tree is left untouched and can be reused
    for every page.  The sidebar cell gets the id 'template-sidebar'.
    """

    sidebar_body = getbody(copy.deepcopy(sidebar), name='sidebar')

    table = ElementTree.Element('table')
    row = ElementTree.SubElement(table, 'tr')

    sidecell = ElementTree.SubElement(row, 'td', valign='top')
    sidecell.set('id', 'template-sidebar')
    sidecell.append(sidebar_body)

    maincell = ElementTree.SubElement(row, 'td')
    maincell.append(body)

    return table



#-------------------------------------------------------------------------------
#
def procfile(temptree, htmlfile, sidebar):
    """Wrap the document held by htmlfile in a copy of the template.

    temptree itself is not modified: a deep copy is filled in and returned.
    The navigation area is generated for htmlfile, and the body of the
    document (placed next to the sidebar if one is given) becomes the only
    child of every <div id="template-contents"> of the copy.  Note that the
    body element of htmlfile.tree is modified in place by getbody().
    """

    otree = copy.deepcopy(temptree)
    procnav(otree, htmlfile)

    contents = getbody(htmlfile.tree, 'procfile')
    if sidebar:
        contents = addsidebar(contents, sidebar)

    # Maybe we should be looking for any type of element with the given id here.
    for node, index, parent in otree.getroot().getiteratorp('div'):
        if node.get('id') != 'template-contents':
            continue
        # clear() also drops the attributes, hence the id is set again
        # before the new contents are attached.
        node.clear()
        node.set('id', 'template-contents')
        node.append(contents)

    return otree

#-------------------------------------------------------------------------------
#
def procindex(temptree, htmldir, sidebar):
    """Build the index page of directory htmldir from a copy of the template.

    The page contains the body of the directory's own index document (when
    it has one) followed by a generated listing of its HTML files and
    subdirectories.  The listing is omitted when the index document contains
    a <span id="noindex">.  Returns the filled-in copy; temptree itself is
    not modified.
    """

    otree = copy.deepcopy(temptree)
    procnav(otree, htmldir)

    # A span with id="noindex" in the directory's index document asks us not
    # to generate the listing.
    skipindex = False
    if htmldir.tree:
        for span in htmldir.tree.getroot().getiterator('span'):
            if span.attrib.get('id') == 'noindex':
                skipindex = True
                break

    def addsection(container, heading, entries):
        """Append a heading and a list of (href, label) links to container."""
        h3 = ElementTree.SubElement(container, 'h3')
        h3.text = heading
        ul = ElementTree.SubElement(container, 'ul')
        for href, label in entries:
            li = ElementTree.SubElement(ul, 'li')
            a = ElementTree.SubElement(li, 'a', href=href)
            a.text = label

    index = None
    if not skipindex:
        index = ElementTree.Element('div')
        index.set('id', 'template-index')

        if htmldir.files:
            addsection(index, 'Document Index',
                       [(basename(ff.fn), ff.title or '[%s]' % ff.fn)
                        for ff in htmldir.files])

        if htmldir.dirs:
            addsection(index, 'Subdirectories',
                       [(join(basename(dd.fn), opts.index),
                         dd.title or '%s' % dd.fn)
                        for dd in htmldir.dirs])

        indenttree(index)

    if htmldir.tree:
        # Put the directory's own index contents before the listing.
        body = getbody(htmldir.tree, 'index')
        combined = ElementTree.Element('div')
        combined.append(body)
        # NOTE(review): this is a truth test on an element, not a test
        # against None; presumably an element without children counts as
        # false, so an empty listing is left out -- confirm this is intended.
        if index:
            combined.append(index)
        index = combined

    if sidebar:
        index = addsidebar(index, sidebar)

    for node, indexi, parent in otree.getroot().getiteratorp('div'):
        if node.get('id') != 'template-contents':
            continue
        # clear() also drops the attributes, hence the id is set again
        # before the new contents are attached.
        node.clear()
        node.set('id', 'template-contents')
        node.append(index)

    return otree

#-------------------------------------------------------------------------------
#
def genhier(ddir, temptree, sidebar=None):
    """Generates the hierarchy of files."""

    print 'dir', ddir.fn

    # create directory
    oroot = normpath(join(opts.output, ddir.fn))
    if not exists(oroot):
        try:
            os.mkdir(oroot)
        except IOError, e:
            raise SystemExit("Error: creating output dir" + str(e))

    # generate index
    ofn = normpath(join(oroot, opts.index))
    try:
        print '  ', ofn
        otree = procindex(temptree, ddir, sidebar)
        otree.write(ofn)
    except Exception, e:
        print >> sys.stderr, "Error: processing index at '%s'" % ddir.fn
        raise

    # generate files
    for ff in ddir.files:
        ofn = normpath(join(opts.output, ff.fn))

        try:
            print '  ', ofn
            otree = procfile(temptree, ff, sidebar)
            otree.write(ofn)
        except Exception, e:
            print >> sys.stderr, "Error: processing '%s'" % ff.fn
            raise

    # call recursively for subdirs
    for dc in ddir.dirs:
        genhier(dc, temptree, sidebar)


#-------------------------------------------------------------------------------
#
def genother(ddir, method='copy'):
    """Applies appropriate treament for the non-HTML files."""

    # copy files
    for fn in ddir.other:
        ifn = normpath(join(ddir.fn, fn))
        ofn = normpath(join(opts.output, ddir.fn, fn))
        print method, ofn, 'for', ifn

        if exists(ofn) or islink(ofn):
            print '  deleting', ofn
            os.unlink(ofn)

        if method == 'copy':
            print '  copying %s to %s' % (ifn, ofn)
            shutil.copy2(ifn, ofn)
        elif method == 'rellink':
            src = relpath(ifn, dirname(ofn))
            print '  linking %s -> %s' %(ofn, src)
            os.symlink(src, ofn)
        elif method == 'abslink':
            ifn = abspath(ifn)
            print '  linking %s -> %s' %(ofn, ifn)
            os.symlink(ifn, ofn)

        print

    # call recursively for subdirs
    for dc in ddir.dirs:
        genother(dc, method)


#===============================================================================
# MAIN
#===============================================================================

def main():
    import optparse
    parser = optparse.OptionParser(__doc__.strip(), version=__version__)
    parser.add_option('-f', '--overwrite', action='store_true',
                      help="overwrite/force output files without checking")
    parser.add_option('-t', '--template', action='store', metavar='FILE',
                      help="specify input template to use")
    parser.add_option('-S', '--stylesheet', action='store', metavar='URL',
                      help="specify stylesheet to use for default template")
    parser.add_option('-o', '--output', action='store', default='html',
                      metavar='DIR',
                      help="specify output directory (%s)" % 'html')
    parser.add_option('-i', '--index', action='store', default='index.html',
                      metavar='FILENAME',
                      help="specify the name of the index files.")
    parser.add_option('-r', '--root-name', action='store', default='(root)',
                      metavar='DIR',
                      help="specify the root element name.")
    parser.add_option('-s', '--sidebar', action='store', metavar='FILE',
                      help="specify an html file to include in sidebar.")
    parser.add_option('-I', '--ignore', action='append', metavar='REGEXP',
                      default=[],
                      help="""specify a regular expression for files and
                      directories to ignore.""")

    group = optparse.OptionGroup(parser, "Treatment of non-HTML files",
                                 """Specify an action to take for other
                                 files. No action means ignore the other
                                 files.""")

    group.add_option('-C', '--copy-other', action='store_const',
                     dest='other_files', const='copy',
                     help="copy non-HTML files to output directory.")

    group.add_option('-R', '--rellink-other', action='store_const',
                     dest='other_files', const='rellink',
                     help="""make relative symbolic links to non-HTML files
                     from output directory.""")

    group.add_option('-A', '--abslink-other', action='store_const',
                     dest='other_files', const='abslink',
                     help="""make absolute symbolic links to non-HTML files
                     from output directory.""")

    # group.add_option('-S', '--speclink-other', action='store', metavar='URL',
    #                  dest='other_files', const='speclink',
    #                  help="""make explicitly specified symbolic links to
    #                  non-HTML files using the given prefix.""")

    parser.add_option_group(group)

    global opts
    opts, args = parser.parse_args()

    def phase(p):
        print
        print p
        print '-' * len(p)

    if not args:
        args = ['.']
    elif len(args) > 1:
        raise SystemExit("Error: you must specify at most one input directory.")
    inroot = args[0]

    if not opts.overwrite and exists(opts.output):
        raise SystemExit(
            "Error: output directory exists. Use --overwrite to disregard.")

    if opts.template and opts.stylesheet:
        raise SystemExit("Error: custom stylesheet can only be applied when"
                         "using the default template")

    # compile ignore regexps
    if opts.ignore:
        try:
            opts.ignore = map(re.compile, opts.ignore)
        except re.error, e:
            raise SystemExit("Error: cannot compile ignore regexps.")

    if not opts.template:
        import StringIO
        if opts.stylesheet:
            style = '@import url("%s");' % opts.stylesheet
        else:
            style = ''
        ttext = def_template % style
        template = StringIO.StringIO(ttext)
    else:
        template = opts.template

    phase('Reading template file')
    try:
        temptree, broot, btitle = readhtml(template)
    except Exception, e:
        raise SystemExit("Error: parsing template:" + str(e))

    sidebar = None
    if opts.sidebar:
        phase('Reading sidebar file')
        try:
            sidebar, sbroot, sbtitle = readhtml(opts.sidebar)
        except Exception, e:
            raise SystemExit("Error: parsing sidebar:" + str(e))

    phase('Reading input html files')
    ddirs = {}
    htmlre = re.compile('^.*.html?$')
    for root, dirs, files in os.walk(inroot, topdown=1):
        if exists(opts.output) and samefile(root, opts.output):
            dirs[:] = []
            continue

        def ignmatch(fn):
            for r in opts.ignore:
                if r.match(fn):
                    return 1
            return None
        didx = []
        for i in xrange(len(dirs)):
            if ignmatch(dirs[i]):
                didx.append(i)
        didx.reverse()
        for i in didx:
            del dirs[i]

        relroot = relpath(root, inroot)
        print '  DIR', relroot

        d = HtmlDir(relroot)
        if relroot != '.':
            d.parent = ddirs[ dirname(root) ]
            d.parent.dirs.append(d)
        ddirs[root] = d

        # ignore the files too
        ignfiles, files = doubfilter(ignmatch, files)
        if ignfiles:
            print '    IGNORED', ' '.join(ignfiles)
        htmlfiles, otherfiles = doubfilter(htmlre.match, files)
        d.other = otherfiles

        parselist = []
        for fn in htmlfiles:
            if fn == opts.index:
                continue

            afn = join(root, fn)

            # ignore sidebar file if necessary
            if opts.sidebar and samefile(afn, opts.sidebar):
                continue
            if opts.template and samefile(afn, opts.template):
                continue

            print '    FILE', afn
            f = HtmlFile(afn)
            f.parent = d
            parselist.append(f)
            print '      ', f.title

        d.files.extend(parselist)

    phase('Performing output and replacement of converted files')

    if not exists(opts.output):
        try:
            os.mkdir(opts.output)
        except IOError, e:
            raise SystemExit("Error: creating output dir" + str(e))

    topdir = ddirs[inroot]
    genhier(topdir, temptree, sidebar)

    if opts.other_files:
        phase('Performing output and replacement of converted files')

        genother(topdir, opts.other_files)

    phase('Output log file')

    logfn = join(opts.output, 'html-wrap.log')
    print 'Writing', logfn
    try:
        f = open(logfn, 'w')
        print >> f, 'Command:'
        print >> f, ' '.join(sys.argv)
        print >> f
        print >> f, 'Generated on:', datetime.datetime.now()
        print >> f
    except IOError, e:
        raise SystemExit("Error: writing log file (%s)" % str(e))


#-------------------------------------------------------------------------------
#
# Built-in template, used by main() when no --template option is given.  The
# single %s placeholder inside <style> is filled by main() with an
# '@import url(...)' rule for the --stylesheet option, or with an empty
# string.  The two empty divs are the placeholders that procnav() and
# procfile()/procindex() look up by id.
# NOTE(review): the stylesheet selects 'a.nav-entry' (a class) whereas
# procnav() sets id="nav-entry" on the current entry, so that selector
# presumably never matches -- confirm which of the two was intended.
def_template = """<?xml version="1.0" encoding="utf-8" ?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
                      "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html>
<head>

<style>

%s

body {
   margin: 0px; }

#template-navigation {
   background-color: #DDDDDD;
   padding: 0.3em 1em 0.3em 1em;
   border-bottom: 2px solid black; }

#template-contents {
   margin-left: 0.5em;
   margin-right: 0.5em; }

#template-navigation a.nav-entry,
#template-navigation a:link,
#template-navigation a:visited {
  text-decoration: none ;
  font-style: italic }

#template-sidebar {
   background-color: #F0F0F0;
   float: left ;
   margin: 0.5em ;
   padding: 0.3em ;
   border: thin dashed #999999 }

</style>

</head>
<body>

<div id="template-navigation" />

<div id="template-contents" />

</body>
</html>
"""

#-------------------------------------------------------------------------------
#
# Run the command-line entry point only when executed as a script.
if __name__ == '__main__':
    main()
