#
# Convert Radio Userland post archives to Movable Type import format
#
# This code is in the public domain.
# There is absolutely no warranty that it'll work.
#
# To call it:
# radio2mt.py "c:\Program Files\Radio UserLand\backups\weblogArchive\posts"

import sys, re, mx.DateTime as dt, htmlentitydefs, string, os

# Default directory holding the Radio UserLand post files; used when no
# directory is given on the command line. Note the trailing backslash.
postArchivesDir = "c:\\Program Files\\Radio UserLand\\backups\\weblogArchive\\posts\\"

# These are MT per-post settings, written into every generated entry.
# NOTE: "status" is defined here but is not part of the entry text built by
# getOneEntryFromFile().
fAllowComments = 0
fConvertBreaks = 0
fAllowPings = 0
primaryCategory = ""
author = "kjk"
status = "publish"

# Post counter; getOneEntryFromFile() increments it for each post and uses
# the result in rewritten /archives/NNNNNN.html links.
# for a fresh blog it should be 1
mtPostNum = 1
# URL prefix of the old Radio weblog; links that start with it and point at
# a day archive are rewritten to MT archive links.
radioRoot = "http://radio.weblogs.com/0109158"

# how many posts to convert, set to -1 for infinity
maxPostsToConvert = -1

# should we generate "TITLE" part of MT log entry
fDoTitle = 1

def isLetter( c ):
    """Return 1 if c is a single ASCII letter (a-z or A-Z), otherwise 0.

    Only one-character strings can be letters: the empty string and longer
    strings (even ones made up of letters only) yield 0.
    """
    # string.ascii_letters is locale-independent and, unlike string.letters
    # and the string.find() function, is available in every Python version.
    if len( c ) == 1 and c in string.ascii_letters:
        return 1
    return 0

def stripTags( txt ):
    # Return txt with every "<...>" run removed. A "<" that is never
    # followed by a ">" is left untouched.
    tagPattern = re.compile('<([^>]*)>')
    withoutTags = tagPattern.sub('', txt)
    return withoutTags

def extractTitleFromBody(body):
    """Derive a title for a post from its (HTML) body.

    The title is, in order of preference:
      1. the text inside the first <b>...</b> pair,
      2. the text inside the first <strong>...</strong> pair,
      3. the text up to the first "." if that comes within the first
         maxWordsCount words,
      4. the first maxWordsCount words followed by "...",
      5. the whole body when it is shorter than that.
    Markup is removed from the returned title in every case.
    """
    maxWordsCount = 5

    # Tag names are matched case-insensitively. The group is non-greedy so
    # that only the FIRST pair is used, not everything between the first
    # opening tag and the last closing tag.
    for tag in ('b', 'strong'):
        inBold = re.search( '<%s>(.*?)</%s>' % (tag, tag), body, re.S|re.I )
        if inBold:
            return stripTags(inBold.group(1))

    # remove all markup from the text
    body = stripTags(body)

    # get first maxWordsCount words up to a dot
    strLen = len(body)
    words = 0
    prevWasLetter = 0
    currPos = 0
    while currPos < strLen:
        c = body[currPos]
        if c == '.':
            # we found dot, so we think this is an end of the sentence,
            # so the title is everything up to here
            return body[:currPos]
        if isLetter( c ):
            prevWasLetter = 1
        else:
            # A word ends where a non-letter follows a letter; a run of
            # separators (", " or double spaces) ends only one word.
            if prevWasLetter:
                words += 1
                if words == maxWordsCount:
                    return body[:currPos] + "..."
            prevWasLetter = 0
        currPos += 1
    # the whole body is shorter than maxWordsCount words, so return it all
    return body

def genTxtWithFlag( txt, flag ):
    # Append "1" plus a newline to txt when flag is true, "0" plus a
    # newline otherwise, and return the resulting line.
    suffix = "0\n"
    if flag:
        suffix = "1\n"
    return txt + suffix

# Maps a day written as "YYYY/MM/DD" to the MT post number of the most
# recently converted post dated that day; filled in by
# getOneEntryFromFile() and used there to rewrite links to old day archives.
dayToPostNumMap = {}

def _decodeEntities( txt ):
    # Replace "&apos;" and the named HTML entities known to htmlentitydefs.
    # Everything is done in ONE pass, so text produced by a replacement
    # (e.g. "&amp;lt;" -> "&lt;") is never decoded a second time.
    # Unknown entity names are left as they are.
    def replaceOne( match ):
        name = match.group(1)
        if name == 'apos':
            return "'"
        return htmlentitydefs.entitydefs.get( name, match.group(0) )
    return re.sub( '&([A-Za-z][A-Za-z0-9]*);', replaceOne, txt )

def _rewriteArchiveLinks( txt ):
    # Replace links of the form $radioRoot/YYYY/MM/DD.html (plus anything
    # that directly follows inside the URL, such as "#anchor") with
    # /archives/NNNNNN.html, where NNNNNN is the post number recorded for
    # that day in dayToPostNumMap. Every link is looked up on its own;
    # links to days without a recorded post are left unchanged.
    linkPat = re.compile( re.escape(radioRoot) + r'/([\d/]+)\.html[^"\'\s<>]*' )
    def replaceOne( match ):
        postNum = dayToPostNumMap.get( match.group(1) )
        if postNum is None:
            return match.group(0)
        return "/archives/%06d.html" % postNum
    return linkPat.sub( replaceOne, txt )

def _stripEnclosingPara( txt ):
    # Remove a <p>...</p> pair, but only when it encloses the WHOLE text
    # (surrounding whitespace aside), so no text outside it is ever lost.
    inPara = re.match( r'\s*<P>(.*)</P>\s*\Z', txt, re.S|re.I )
    if inPara:
        return inPara.group(1)
    return txt

def getOneEntryFromFile( fileName ):
    """Convert one Radio UserLand post file into a Movable Type import entry.

    fileName -- path of the post file; it must contain a
                <date name="when" value="..."> and a
                <string name="text" value="..."> element.

    Returns the entry as a string: the header lines (AUTHOR, TITLE, ALLOW
    COMMENTS, CONVERT BREAKS, ALLOW PINGS, PRIMARY CATEGORY, DATE), a
    "-----" line, the BODY section and the closing "--------" line.

    Side effects: increments the global mtPostNum and records the post's
    day ("YYYY/MM/DD") -> mtPostNum in dayToPostNumMap, so that links to
    that day's Radio archive page can be rewritten in this and later posts.

    Raises ValueError if the date or text element is missing, and whatever
    open()/read() raise when the file cannot be read.
    """
    global mtPostNum

    # try/finally (rather than "with") keeps this valid on old Pythons
    f = open(fileName)
    try:
        entry = f.read()
    finally:
        f.close()

    dateMatch = re.search('<date name="when" value="([^"]*)', entry)
    textMatch = re.search('<string name="text" value="([^"]*)"', entry)
    if dateMatch is None or textMatch is None:
        raise ValueError( "%s: no \"when\" date or \"text\" string found" % fileName )

    dateTime = dt.DateTimeFrom(dateMatch.group(1))
    date = dateTime.Format('%m/%d/%Y %H:%M:%S')
    body = textMatch.group(1)

    # The post is valid: give it the next post number and remember which
    # day it belongs to (key format YYYY/MM/DD, as used in Radio's URLs).
    mtPostNum += 1
    txtDate = "%04d/%02d/%02d" % (dateTime.year, dateTime.month, dateTime.day)
    dayToPostNumMap[ txtDate ] = mtPostNum

    body = _decodeEntities( body )
    body = _rewriteArchiveLinks( body )
    # remove enclosing <p>...</p> from posts, so that they display nicely
    body = _stripEnclosingPara( body )

    parts = []

    if author != "":
        parts.append( "AUTHOR: " + author + "\n" )

    if fDoTitle:
        parts.append( "TITLE: " + extractTitleFromBody( body ) + "\n" )

    parts.append( genTxtWithFlag( "ALLOW COMMENTS: ", fAllowComments ) )
    parts.append( genTxtWithFlag( "CONVERT BREAKS: ", fConvertBreaks ) )
    parts.append( genTxtWithFlag( "ALLOW PINGS: ", fAllowPings ) )

    parts.append( "PRIMARY CATEGORY:" + primaryCategory + "\n" )

    parts.append( "DATE:" + date + "\n" )
    parts.append( "-----\n" )
    parts.append( "BODY:\n" )
    parts.append( body + "\n" )
    parts.append( "--------\n" )
    return "".join( parts )

# Directory with the Radio post files: the first command-line argument if
# one was given, otherwise the built-in default.
archiveDir = postArchivesDir
if len(sys.argv) > 1:
    archiveDir = sys.argv[1]

# List the directory that is actually going to be read (which is not the
# default one when a command-line argument was given) and convert the
# posts in sorted file-name order.
files = os.listdir( archiveDir )
files.sort()
entries = []
# Counts down to 0; with the "infinity" setting of -1 it never reaches 0.
count = maxPostsToConvert
for f in files:
    # os.path.join works whether or not archiveDir ends with a separator
    fileName = os.path.join( archiveDir, f )
    if not os.path.isfile( fileName ):
        # sub-directories and the like are not posts
        continue
    entries.append( getOneEntryFromFile( fileName ) )
    count -= 1
    if 0 == count:
        break
res = "".join( entries )
# Same output as "print res" (text plus one newline).
sys.stdout.write( res + "\n" )