#!/usr/bin/python

from datetime import datetime
import re
import os
from bs4 import BeautifulSoup
from email.utils import parsedate_tz


def san(unicode_str):
    '''
    for sanitizing strings:
     - getting rid of all non-utf8 things
     - escape single quotations
    '''
    s = unicode_str.encode('utf-8', 'ignore')
    s = s.replace("\\'", "'") # first unescape escaped ones
    s = re.sub('''(['"])''', r'\\\1', s) # now we escape all
    return s


psql = open('posterous-import.sql', 'w')

xmls = [xf for xf in os.listdir('posts') if xf.endswith('.xml')]
for xf in xmls[:-2]:
    xfp = open('posts/{}'.format(xf), 'r')
    soup = BeautifulSoup(xfp.read())
    
    title = soup.find('title').text
    tt = parsedate_tz(soup.find('pubdate').text)
    tdt = datetime(tt[0], tt[1], tt[2], tt[3], tt[4], tt[5])
    sql_date = tdt.strftime('%Y-%m-%d %H-%M-%S')
    content = soup.find('content:encoded').text
    
    psql.write("INSERT INTO my_table \
                (title, content, inputdate) VALUES \
                ('{t}', '{c}', '{ed}');\n\n"\
               .format(t=san(title), c=san(content),
                ed=sql_date))

psql.close()