import sys, re, mx.DateTime as dt, htmlentitydefs, string, os
# ---- Conversion settings ---------------------------------------------------
# Default directory holding the Radio UserLand post backup files; the script
# at the bottom of the file uses it unless a directory is given on the
# command line.
postArchivesDir = "c:\\Program Files\\Radio UserLand\\backups\\weblogArchive\\posts\\"
# Flags written as 1/0 on the "ALLOW COMMENTS:", "CONVERT BREAKS:" and
# "ALLOW PINGS:" header lines of every generated entry.
fAllowComments = 0
fConvertBreaks = 0
fAllowPings = 0
# Value written on the "PRIMARY CATEGORY:" header line.
primaryCategory = ""
# Value written on the "AUTHOR:" header line; the line is omitted when empty.
author = "kjk"
# Post status; not used when building entries in the visible code.
status = "publish"
# Running post number. It is incremented before each entry is converted, so
# the first converted entry receives this value plus one. The number is used
# to build "/archives/NNNNNN.html" links.
mtPostNum = 1
# Base URL of the Radio weblog; links in post bodies that point below it are
# rewritten to "/archives/NNNNNN.html".
radioRoot = "http://radio.weblogs.com/0109158"
# Maximum number of files to convert. The main loop stops only when its
# countdown reaches exactly 0, so a non-positive value means "no limit".
maxPostsToConvert = -1
# When true, a "TITLE:" header line is derived from the post body.
fDoTitle = 1
def isLetter( c ):
    """Return 1 if c is a single alphabetic character, otherwise 0.

    The previous implementation searched for c as a *substring* of the
    alphabet, so the empty string and runs such as "ab" were reported as
    letters. Only one-character strings can be letters now.
    """
    # len() check first: "ab".isalpha() is true, but "ab" is not one letter.
    return int(len(c) == 1 and c.isalpha())
def stripTags( txt ):
    """Return txt with every "<...>" span removed."""
    tagPattern = re.compile('<([^>]*)>')
    return tagPattern.sub('', txt)
def extractTitleFromBody(body):
    """Derive a short plain-text title from the HTML body of a post.

    Rules, in order:
      1. If the body contains a <b>...</b> run, its text (tags removed) is
         the title; otherwise the same is tried with <strong>...</strong>.
         Tag names are matched case-insensitively.
      2. Otherwise the tags are stripped and the text is scanned: the title
         is everything before the first '.', or everything before the
         maxWordsCount-th non-letter character followed by "...".
      3. If neither limit is reached, the whole stripped text is the title.

    body -- HTML text of the post.
    Returns the title as a string (possibly empty).
    """
    maxWordsCount = 5

    # Non-greedy groups: take the first emphasised run only, instead of
    # everything between the first opening and the last closing tag.
    for pattern in ('<b>(.*?)</b>', '<strong>(.*?)</strong>'):
        inBold = re.search(pattern, body, re.S | re.I)
        if inBold:
            return stripTags(inBold.group(1))

    text = stripTags(body)
    word = 0
    # The original loop condition compared the position against the builtin
    # `len` rather than the text length, so it ran off the end of short
    # bodies; iterating over the text itself cannot do that.
    for currPos, c in enumerate(text):
        if c == '.':
            return text[:currPos]
        if not isLetter( c ):
            # Every non-letter character counts as one word boundary.
            word += 1
            if word == maxWordsCount:
                return text[:currPos] + "..."
    return text
def genTxtWithFlag( txt, flag ):
    """Return txt followed by "1" for a true flag or "0" for a false one,
    terminated by a newline."""
    digit = "1\n" if flag else "0\n"
    return txt + digit
# Maps a post's day, formatted "YYYY/MM/DD", to the post number of the most
# recently converted entry dated that day. getOneEntryFromFile fills it in and
# consults it when rewriting links that point at the old Radio day pages.
dayToPostNumMap = {}
def _decodeEntities( text ):
    """Replace named character entities in text, in a single pass.

    One pass means the output of one replacement is never decoded again
    ("&amp;lt;" becomes "&lt;", not "<"). Unknown names are left untouched.
    """
    def replace( match ):
        name = match.group(1)
        # The apostrophe is not in htmlentitydefs, so it is handled here.
        # NOTE(review): the original line for this was garbled into invalid
        # Python; "&apos;" and "&#39;" are both decoded on the assumption
        # that one of them was meant -- confirm against a real backup file.
        if name in ('apos', '#39'):
            return "'"
        return htmlentitydefs.entitydefs.get(name, match.group(0))
    return re.sub(r'&(#39|[A-Za-z][A-Za-z0-9]*);', replace, text)

def _rewriteRadioLinks( body ):
    """Point links to Radio day pages at the matching "/archives/NNNNNN.html".

    Each link is looked up on its own day in dayToPostNumMap. Links to days
    that have not been converted (yet) are left unchanged instead of raising
    KeyError.
    """
    # re.escape: the dots of the base URL must match literally. The trailing
    # [^"]* swallows anything after ".html" up to the closing quote.
    linkPat = re.compile(re.escape(radioRoot) + r'/([\d/]+)\.html[^"]*')
    def replace( match ):
        postNum = dayToPostNumMap.get(match.group(1))
        if postNum is None:
            return match.group(0)
        return "/archives/%06d.html" % postNum
    return linkPat.sub(replace, body)

def getOneEntryFromFile( fileName ):
    """Convert one post backup file into one text entry for import.

    Reads the file, takes the value of its <date name="when"> and
    <string name="text"> elements, decodes entities in the text, rewrites
    links to old day pages and strips an enclosing <P>...</P>. The result is
    a block of AUTHOR/TITLE/ALLOW COMMENTS/CONVERT BREAKS/ALLOW PINGS/
    PRIMARY CATEGORY/DATE header lines, a "-----" separator, the BODY and a
    closing "--------" line (this looks like the Movable Type import layout
    -- TODO confirm).

    Side effects: increments the module-level mtPostNum and records the
    post's day in dayToPostNumMap.

    fileName -- path of the backup file to read.
    Returns the entry as a string.
    Raises ValueError if the file lacks the date or the text element.
    """
    global mtPostNum

    mtPostNum += 1

    # Close the file deterministically instead of leaking the handle.
    with open(fileName) as f:
        entry = f.read()

    dateMatch = re.search('<date name="when" value="([^"]*)', entry)
    textMatch = re.search('<string name="text" value="([^"]*)"', entry)
    if dateMatch is None or textMatch is None:
        raise ValueError(
            "%s: no 'when' date or 'text' string element found" % fileName)

    dateTime = dt.DateTimeFrom(dateMatch.group(1))
    date = dateTime.Format('%m/%d/%Y %H:%M:%S')
    txtDate = "%04d/%02d/%02d" % (dateTime.year, dateTime.month, dateTime.day)
    # Recorded before the links are rewritten, so a post can link to its
    # own day.
    dayToPostNumMap[ txtDate ] = mtPostNum

    body = _decodeEntities(textMatch.group(1))
    body = _rewriteRadioLinks(body)

    # Unwrap an enclosing paragraph: first one that starts at the beginning
    # of a line, then one that starts anywhere. The second pattern is applied
    # to the result of the first.
    # NOTE(review): the second pattern also discards any text that precedes
    # the first <P>; kept as in the original.
    for pattern in ('^<P>(.*)</P>$', '<P>(.*)</P>$'):
        inPara = re.search(pattern, body, re.S | re.M | re.I)
        if inPara:
            body = inPara.group(1)

    parts = []
    if author != "":
        parts.append("AUTHOR: " + author + "\n")
    if fDoTitle:
        parts.append("TITLE: " + extractTitleFromBody( body ) + "\n")
    parts.append(genTxtWithFlag( "ALLOW COMMENTS: ", fAllowComments ))
    parts.append(genTxtWithFlag( "CONVERT BREAKS: ", fConvertBreaks ))
    parts.append(genTxtWithFlag( "ALLOW PINGS: ", fAllowPings ))
    parts.append("PRIMARY CATEGORY:" + primaryCategory + "\n")
    parts.append("DATE:" + date + "\n")
    parts.append("-----\n")
    parts.append("BODY:\n")
    parts.append(body + "\n")
    parts.append("--------\n")
    return "".join(parts)
# ---- Script body -----------------------------------------------------------
# Convert the backup files of one directory, in sorted file-name order, and
# write all generated entries to standard output.
archiveDir = postArchivesDir
if len(sys.argv) > 1:
    # A directory given on the command line replaces the default.
    archiveDir = sys.argv[1]
# List the directory that is actually converted. The original listed
# postArchivesDir here, so a command-line directory was only half honoured.
files = os.listdir( archiveDir )
files.sort()
res = ""
# Countdown of files still to convert; the loop stops only when it hits
# exactly 0, so a non-positive maxPostsToConvert converts everything.
count = maxPostsToConvert
for f in files:
    # os.path.join copes with a directory given without a trailing separator.
    fileName = os.path.join( archiveDir, f )
    res += getOneEntryFromFile( fileName )
    count -= 1
    if 0 == count:
        break
# Same output as the former `print res`: the text followed by one newline.
sys.stdout.write(res + "\n")