#!/usr/bin/python from datetime import datetime import re import os from bs4 import BeautifulSoup from email.utils import parsedate_tz def san(unicode_str): ''' for sanitizing strings: - getting rid of all non-utf8 things - escape single quotations ''' s = unicode_str.encode('utf-8', 'ignore') s = s.replace("\\'", "'") # first unescape escaped ones s = re.sub('''(['"])''', r'\\\1', s) # now we escape all return s psql = open('posterous-import.sql', 'w') xmls = [xf for xf in os.listdir('posts') if xf.endswith('.xml')] for xf in xmls[:-2]: xfp = open('posts/{}'.format(xf), 'r') soup = BeautifulSoup(xfp.read()) title = soup.find('title').text tt = parsedate_tz(soup.find('pubdate').text) tdt = datetime(tt[0], tt[1], tt[2], tt[3], tt[4], tt[5]) sql_date = tdt.strftime('%Y-%m-%d %H-%M-%S') content = soup.find('content:encoded').text psql.write("INSERT INTO my_table \ (title, content, inputdate) VALUES \ ('{t}', '{c}', '{ed}');\n\n"\ .format(t=san(title), c=san(content), ed=sql_date)) psql.close()