I use a little python script to grab clippings out of my (early model) kindle’s My Clippings.txt file and import them into an sqlite3 database. The clips and notes are in a consistant format, so it’s possible to parse the file with some regular expressions, and a little logic. I found a script that did most of the work already, and converted it to send the clips to a database.

The sqlite3 database has a table for holding these clips. This is the basic structure of the table:

CREATE TABLE IF NOT EXISTS "clips" (
    `clipID`     INTEGER PRIMARY KEY AUTOINCREMENT,
    `bid`        INTEGER,
    `type`       TEXT,
    `location`   INTEGER NOT NULL,
    `text`       TEXT,
    `datestring` TEXT
);

For a little added efficiency, the import script will check for the latest date in the table and ignore any clips in My Clippings.txt from before that date.

This works well until my kindle runs out of charge completely and resets the date back to 1970. Then, because the kindle doesn’t really show the date anywhere, I don’t notice the problem for a while. (You’ll see there’s a little hack in the code that I use to grab those clips.)

There’s also a table for keeping track of the books themselves that’s required:

CREATE TABLE IF NOT EXISTS "books" (                                                          
    `id`        INTEGER,
    `book`      TEXT,
    `Title`     TEXT,                                                                        
    `SubTitle`  TEXT,
    `Author`    TEXT,
    PRIMARY KEY(`id`)
);
  • I think those are all the required fields. My version has some other fields for extra information.

grabClippings.py

  • requires zc.lockfile
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import re
from datetime import datetime
import sqlite3
from shutil import copyfile
import zc.lockfile

# EDIT THESE TWO FILEPATHS
DATABASE = u"/home/xxxx/Sync/SRS/kindleClips.sqlite3"
MYCLIPPINGS = u"/media/xxxx/Kindle/documents/My Clippings.txt"

BOUNDARY = u"==========\r\n"
TEMPFILE = u"/tmp/kindleClippings.txt"
book_ids = {}

def get_sections(filename):
    with open(filename, 'rb') as f:
        content = f.read().decode('utf-8')
    content = content.replace(u'\ufeff', u'')
    return content.split(BOUNDARY)


def get_clip(section):
    clip = {}

    lines = [l for l in section.split(u'\r\n') if l]
    if len(lines) != 3:
        return

    clip['book'] = lines[0]
    match = re.search(r'(\d+)-\d+', lines[1]) #Matches only highlights
    if not match:
        match = re.search(r'(\d+)', lines[1])
        if not match:
            return
    position = match.group(1)

    #Grab Date String
    dmatch = re.search(r'Added on (.*)$', lines[1])
    if dmatch:
        date = datetime.strptime(dmatch.group(1), '%A, %B %d, %Y, %I:%M %p')
    else:
        date = None

    if "- Highlight Loc." in lines[1]:
        clipType = "highlight"
    elif "- Note Loc." in lines[1]:
        clipType = "note"
    else:
        clipType = "bookmark"

    clip['position'] = int(position)
    clip['content'] = lines[2]
    clip['type'] = clipType
    clip['date'] = date
    clip['bid'] = get_book_id(clip['book'])

    return clip


def sql_file(clip):
    """
    Export clip to sql file if unique
    """
    #print("adding clip to database: %s" % clip)
    #Add to sql file
    if clip['content']:
        # check if exists
        c.execute("select count(clipID) from clips where bid = ? and type = ? and location = ? and text = ? and datestring = ?", (clip['bid'],clip['type'],clip['position'],clip['content'], str(clip['date'])))
        ret = c.fetchone()
        if not ret[0]:
            # insert
            c.execute("insert into clips (bid, type, location, text, datestring) values (?,?,?,?,?)", (clip['bid'], clip['type'], clip['position'], clip['content'], str(clip['date'])))
    else:
        # check if it exists
        c.execute("select count(clipID) from clips where bid = ? and type = ? and location = ? and  datestring = ?)", (clip['bid'], clip['type'], clip['position'], str(clip['date'])))
        ret = c.fetchone()
        if not ret[0]:
            # insert
            c.execute("insert into clips (bid, type, location, datestring) values (?,?,?,?)", (clip['bid'], clip['type'], clip['position'], str(clip['date'])))

def find_most_recent():
    c.execute("select datestring from clips order by datetime(datestring) DESC limit 1")
    result = c.fetchone()
    if result:
        lastdate = datetime.strptime(result[0], '%Y-%m-%d %H:%M:%S')
    else:
        lastdate = datetime.strptime('1900-01-01 00:00:00', '%Y-%m-%d %H:%M:%S')
    return lastdate

def add_book(book):
    #find title, subtitle, and author
    title = None
    subtitle = None
    author = None
    if '(' in book:
        author = book.split('(')[1].split(')')[0].strip()
        title = book.split('(')[0].strip()
    if title and '-' in title:
        subtitle = title.split('-')[1].strip()
        title = title.split('-')[0].strip()
    if not subtitle and title and ':' in title:
        subtitle = title.split(':')[1].strip()
        title = title.split(':')[0].strip()
    #insert
    c.execute("insert into books (book, Title, SubTitle, Author) values (?, ?, ?, ?)", (book, title, subtitle, author))
    conn.commit()

def get_book_id(book):
    global book_ids
    bid = None
    if book in book_ids and book_ids[book]:
        return book_ids[book]
    while not bid:
        c.execute("select id from books where book = ?", (book,))
        result = c.fetchone()
        if result:
            bid = result[0]
            book_ids[book] = bid
            return bid
        else:
            print('We need to add the book')
            add_book(book)

def main():
    global book_ids
    # extract clips
    mostRecent = find_most_recent()
    #mostRecent = datetime.strptime('1971-01-01 00:00:00', '%Y-%m-%d %H:%M:%S') #hack
    # Hack above: when date gets screwed up on kindle, year reverts to 1970, so 
    #             highlights should all be before 1971. Change the > sign in date
    #             comparison 'if' statement below to < to use.
    print("Most Recent in Database: %s" % mostRecent)
    sections = get_sections(TEMPFILE)
    for section in sections:
        clip = get_clip(section)
        if clip:
            #print(clip['date'])
            if clip['date'] and clip['date'] > mostRecent:
                sql_file(clip)

if __name__ == '__main__':
    try:
        lock = zc.lockfile.LockFile('/tmp/grabClippings.lock')
    except zc.lockfile.LockError:
        print("Can't create lock file")
        exit()
    #copy clippings file
    if os.path.isfile(MYCLIPPINGS):
        copyfile(MYCLIPPINGS, TEMPFILE)
    if not os.path.isfile(TEMPFILE):
        print("Temp file does not exist")
        exit()
    conn = sqlite3.connect(DATABASE)
    c = conn.cursor()
    main()

    conn.commit()
    conn.close()
    lock.close()