For the longest time, I've wanted to search PDF files from the command line. Now I can, with pdfsearch.py! This program uses pyPdf to look for pages containing text that matches the provided regex. Because the text pyPdf extracts is messy, it prints only the page numbers of the matching pages rather than the matching lines themselves.
pdfsearch.py
#!/usr/bin/python
from optparse import OptionParser
from os import walk
from os.path import isfile
from os.path import join
from re import IGNORECASE
from re import compile as re_compile
from sys import argv

from pyPdf import PdfFileReader
# Command-line interface: one positional search term (a regular expression),
# optionally followed by the paths to search, plus a single -i/--insensitive
# switch that is stored as a boolean on ``options.insensitive``.
parser = OptionParser(
    usage='%s [ options ] term [ file1..fileN ]' % argv[0],
    description='Search for text in PDF files.',
)
parser.add_option(
    '-i', '--insensitive',
    dest='insensitive',
    action='store_true',
    default=False,
    help='Search case-insensitively.',
)
def pdfgrep(expr, file):
    """Yield the zero-based index of every page of a PDF whose text matches.

    expr -- compiled regular expression; its ``search`` method is applied to
            the stripped text extracted from each page.
    file -- path of the PDF file, opened in binary mode.

    This is a generator: the page loop runs inside the ``with`` block, so the
    file stays open while results are being consumed and is closed once the
    generator finishes.
    """
    with open(file, 'rb') as stream:
        reader = PdfFileReader(stream)
        page_count = reader.getNumPages()
        for page_index in xrange(page_count):
            page_text = reader.getPage(page_index).extractText()
            if expr.search(page_text.strip()):
                yield page_index
# ---------------------------------------------------------------------------
# Script body (Python 2): parse the command line, compile the search term and
# print "<file>:<page>, <page>, ..." for every PDF containing a match.
# ---------------------------------------------------------------------------

# Decode the raw command-line arguments as UTF-8 so that the pattern and the
# paths are unicode objects from here on.
argv = [unicode(i, 'utf8') for i in argv]
options, args = parser.parse_args(argv[1:])

# Maps the ``dest`` name of a boolean option to the regex flag it enables.
optionmap = {
    'insensitive': IGNORECASE,
}

if not args:
    # parser.error() prints the usage message and exits, so nothing below
    # runs without a search term.
    parser.error('No search term provided.')

# OR together the regex flags of every option that was switched on.
term_flags = 0
for key, value in optionmap.iteritems():
    if getattr(options, key):
        term_flags |= value
term = re_compile(args[0], term_flags)

# Everything after the term is a path to search; default to the current
# directory when none is given.
paths = args[1:] or ['.']


def _is_pdf(name):
    """Return True when *name* ends in ``.pdf``, ignoring case."""
    return name.lower().endswith('.pdf')


def _pdf_files(path):
    """Yield the PDF files designated by *path*.

    A regular file is yielded as-is (os.walk() produces nothing for a
    non-directory, so files named on the command line need this explicit
    branch); a directory is walked recursively.  The same ``.pdf`` extension
    filter is applied in both cases.
    """
    if isfile(path):
        if _is_pdf(path):
            yield path
        return
    for dirpath, dirnames, filenames in walk(path):
        for filename in filenames:
            if _is_pdf(filename):
                yield join(dirpath, filename)


for path in paths:
    for fullfilename in _pdf_files(path):
        pages = list(pdfgrep(term, fullfilename))
        if pages:
            # pdfgrep yields zero-based indices; report one-based page numbers.
            print(u'%s:%s' % (fullfilename, u', '.join([str(i + 1) for i in pages])))
% python pdfsearch.py -i undo ../Notes/CSE444
../Notes/CSE444/2009-05-07:QuizSection_Midterm.pdf:4
../Notes/CSE444/PDF/lecture14.pdf:3, 6, 10, 11
../Notes/CSE444/PDF/lecture13.pdf:17, 18, 22
../Notes/CSE444/PDF/lecture09-10.pdf:2, 16, 31, 32, 36, 37, 39, 40, 41, 42, 43, 45, 48, 49, 58, 59, 60, 62, 63, 65
../Notes/CSE444/PDF/lecture11.pdf:30
No comments:
Post a Comment