How to Rename PDF Files with their Contents using python

How to Rename Multiple PDF Files with their Respective Contents using python

This solution contains Python code that renames PDF files based on their contents, using Python and a tkinter GUI. If you are looking for a ready-made solution, you can check out this link – Free PDF Tool.

Install all the dependencies and import them

You can install all these dependencies by running the below command in the Terminal

pip install [library name]

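E.g., pip install pdfplumber (tkinter is part of Python's standard library and is not installed through pip; also note that camelot is installed as camelot-py and win32com as pywin32)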

from tkinter import *
from tkinter import messagebox, simpledialog
import tkinter.filedialog
import os
import PyPDF2
import re
import pikepdf
from pikepdf import _cpphelpers
import pdfplumber
import camelot
import glob
import win32com.client
import pandas as p
import csv
import xlsxwriter

# Main application window: fixed title and a 550x500 pixel initial size.
root = Tk()
root.wm_title('PDF Tool 1.0')
root.wm_geometry('550x500')

How to Rename PDF Files with their Contents

def renamePDF():
    """Rename each PDF under a chosen folder using text from its first page.

    Asks for a folder, a search pattern and a length. For every ``.pdf``
    file found while walking the folder, the text of the first page is
    extracted; the characters that follow the first match of the pattern
    (up to the requested length, with all whitespace removed) become the
    new file name. Renamed files are placed directly in the selected
    folder, including files that were found in sub-folders.

    The search text is interpreted as a regular expression. Files whose
    first page has no match, or whose extracted name is empty, are left
    untouched. The function returns without doing anything if any dialog
    is cancelled.
    """
    filepath = tkinter.filedialog.askdirectory(title='Select File Folder')
    if not filepath:
        return

    search_text = tkinter.simpledialog.askstring(title='', prompt='Enter the text to be Searched')
    text_position = tkinter.simpledialog.askstring(title='', prompt='Enter the length of a string after the searched text')
    # askstring returns None when the dialog is cancelled.
    if search_text is None or text_position is None:
        return
    try:
        name_length = int(text_position)
    except ValueError:
        tkinter.messagebox.showerror(title='', message="The length must be a whole number")
        return

    for folder, dirs, files in os.walk(filepath):
        for filename in files:
            if os.path.splitext(filename)[1] != '.pdf':
                continue
            fullpath = os.path.join(folder, filename)

            # Read the text inside a context manager so the file handle is
            # released before the rename, whether or not a match is found.
            with pdfplumber.open(fullpath) as open_pdf:
                pages = open_pdf.pages
                pdf_text = (pages[0].extract_text() if pages else None) or ''

            # Only the first match is used: once the file has been renamed,
            # its old path no longer exists for any further match.
            match = re.search(search_text, pdf_text)
            if match is None:
                continue
            doc_text = pdf_text[match.end():match.end() + name_length]
            doc_newname = re.sub(r"\s+", "", doc_text)
            if not doc_newname:
                continue
            os.rename(fullpath, os.path.join(filepath, doc_newname + '.pdf'))

    tkinter.messagebox.showinfo(title='', message="Files Renamed Successfully")

How to Split Multi-Page PDF File into One Page PDF File(s)

def createIndividualPDF():
    """Split every PDF under a chosen folder into one file per page.

    Asks for an input folder and an output folder. Each ``.pdf`` file found
    while walking the input folder is read with PyPDF2 and every page is
    written to the output folder as ``<original file name>-<page index>.pdf``
    (page index starts at 0). Returns without doing anything if either
    dialog is cancelled.
    """
    filepath = tkinter.filedialog.askdirectory(title='Select File Folder')
    if not filepath:
        return
    outputPath = tkinter.filedialog.askdirectory(title='Select Ouput Folder')
    if not outputPath:
        return

    for folder, dirs, files in os.walk(filepath):
        for filename in files:
            # Only PDF files can be opened by the reader below.
            if os.path.splitext(filename)[1] != '.pdf':
                continue
            fullpath = os.path.join(folder, filename)
            # Keep the source open while its pages are being copied, and
            # close it as soon as the last page has been written.
            with open(fullpath, 'rb') as source_file:
                open_pdf = PyPDF2.PdfFileReader(source_file)
                for i in range(open_pdf.numPages):
                    output = PyPDF2.PdfFileWriter()
                    output.addPage(open_pdf.getPage(i))
                    page_path = os.path.join(outputPath, filename + '-%s.pdf' % i)
                    with open(page_path, 'wb') as output_pdf:
                        output.write(output_pdf)

    tkinter.messagebox.showinfo(title='', message="Individual Files Created Successfully")

How to Remove Password (Single PDF File)

def removedPassword():
    """Save an unprotected copy of a single password-protected PDF.

    Asks for the PDF file, its password and a destination folder, opens the
    file with pikepdf using that password and saves it as
    ``newfilewithoutpasword.pdf`` in the destination folder. Returns without
    doing anything if any dialog is cancelled; shows an error box if pikepdf
    rejects the password.
    """
    filename = tkinter.filedialog.askopenfilename()
    if not filename:
        return
    password = tkinter.simpledialog.askstring(title='Enter Password', prompt='')
    # askstring returns None when the dialog is cancelled.
    if password is None:
        return
    filepath = tkinter.filedialog.askdirectory(title='Select Folder to Save File')
    if not filepath:
        return

    try:
        # The context manager closes the source file once the copy is saved.
        with pikepdf.Pdf.open(filename, password=password) as my_pdf:
            my_pdf.save(os.path.join(filepath, 'newfilewithoutpasword.pdf'))
    except pikepdf.PasswordError:
        tkinter.messagebox.showerror(title='', message="Incorrect password")
        return

    tkinter.messagebox.showinfo(title='', message="Password has been Removed Successfully")

This article covers the Python code for renaming PDF files based on their contents.

How to Copy Multiple/Single PDF file(s) to Excel Sheet(s)

def pdftoTables():
    """Extract tables from every PDF under a chosen folder into Excel files.

    Asks for a folder and a page selection (the string is passed unchanged
    to ``camelot.read_pdf`` as ``pages``). For each file whose name ends in
    ``.pdf`` found while walking the folder, the tables detected with the
    ``stream`` flavor are written to ``<file name without extension>.xlsx``
    in the selected folder, one sheet per table (``sheet0``, ``sheet1``, ...).
    Returns without doing anything if either dialog is cancelled or the
    page selection is empty.
    """
    # The file-level import binds pandas to the name `p`, so bring in the
    # conventional `pd` alias locally.
    import pandas as pd

    directoryPath = tkinter.filedialog.askdirectory(title='Select File Folder')
    if not directoryPath:
        return
    input_page_nos = tkinter.simpledialog.askstring(title='Enter Page No.', prompt='E,g. 1,2,3 or'+'  all')
    if not input_page_nos:
        return

    for folders, sub_folders, files in os.walk(directoryPath):
        for name in files:
            if not name.endswith(".pdf"):
                continue
            filename = os.path.join(folders, name)
            tables = camelot.read_pdf(filename, pages=input_page_nos, flavor='stream')
            # splitext keeps dots inside the name ("a.b.pdf" -> "a.b").
            newname = os.path.splitext(name)[0]
            workbook_path = os.path.join(directoryPath, newname + '.xlsx')
            # The context manager saves and closes the workbook on exit, so
            # no explicit save call is needed.
            with pd.ExcelWriter(workbook_path, engine='xlsxwriter') as writer:
                for x, table in enumerate(tables):
                    table.df.to_excel(writer, sheet_name='sheet%s' % x, index=False, header=False)

    tkinter.messagebox.showinfo(title='', message="Tables Extracted Successfully")

How to Merge Multiple PDF Files Into One

def mergePDFs():
    """Merge all PDFs in a chosen folder into ``newmergedfile.pdf``.

    Asks for a folder, appends the pages of every ``*.pdf`` file directly in
    that folder (in sorted file-name order) to a new pikepdf document and
    saves it as ``newmergedfile.pdf`` in the same folder. An existing
    ``newmergedfile.pdf`` from a previous run is not used as an input.
    Returns without doing anything if the dialog is cancelled.
    """
    directory = tkinter.filedialog.askdirectory(title='Select Folder to Save File')
    if not directory:
        return

    output_name = 'newmergedfile.pdf'
    output_path = os.path.join(directory, output_name)
    pdf = pikepdf.Pdf.new()
    sources = []
    try:
        # Sorted so the page order of the result is predictable.
        for pdf_filepath in sorted(glob.glob(os.path.join(directory, '*.pdf'))):
            # Skip the result of an earlier merge so it is not merged into itself.
            if os.path.normcase(os.path.basename(pdf_filepath)) == os.path.normcase(output_name):
                continue
            src = pikepdf.Pdf.open(pdf_filepath)
            # Sources are kept open until after the save and closed in the
            # finally block below.
            sources.append(src)
            pdf.pages.extend(src.pages)
        pdf.save(output_path)
    finally:
        for src in sources:
            src.close()
        pdf.close()

    tkinter.messagebox.showinfo(title='', message="Files Merged Successfully")

How to Convert Multiple/Single PDF file(s) to Word

def pdfToword():
    """Convert every PDF in a chosen folder to .docx using MS Word.

    Asks for a folder, starts Word through COM, opens each ``*.pdf`` file
    directly in that folder and saves it next to the original with a
    ``.docx`` extension. Word is quit when the loop finishes or fails.
    Returns without starting Word if the dialog is cancelled.
    """
    pdfs_path = tkinter.filedialog.askdirectory()  # folder where the .pdf files are stored
    if not pdfs_path:
        return

    # Start Word only after a folder was chosen so a cancelled dialog does
    # not leave a Word instance running.
    word = win32com.client.Dispatch("Word.Application")
    word.visible = 1
    try:
        for doc in glob.iglob(os.path.join(pdfs_path, "*.pdf")):
            in_file = os.path.abspath(doc)
            out_file = os.path.splitext(in_file)[0] + ".docx"
            wb = word.Documents.Open(in_file)
            try:
                wb.SaveAs2(out_file, FileFormat=16)  # file format for docx
            finally:
                wb.Close()
    finally:
        word.Quit()

    tkinter.messagebox.showinfo(title='', message="Files Converted to Word Successfully")

Complete Python Code

#Code by Talentnett Solutions
from tkinter import *
from tkinter import messagebox, simpledialog
import tkinter.filedialog
import os
import PyPDF2
import re
import pikepdf
from pikepdf import _cpphelpers
import pdfplumber
import camelot
import glob
import win32com.client
import pandas as p
import csv
import xlsxwriter

# Main application window: fixed title and a 550x500 pixel initial size.
root = Tk()
root.wm_title('PDF Tool 1.0')
root.wm_geometry('550x500')

def createIndividualPDF():
    """Split every PDF under a chosen folder into one file per page.

    Asks for an input folder and an output folder. Each ``.pdf`` file found
    while walking the input folder is read with PyPDF2 and every page is
    written to the output folder as ``<original file name>-<page index>.pdf``
    (page index starts at 0). Returns without doing anything if either
    dialog is cancelled.
    """
    filepath = tkinter.filedialog.askdirectory(title='Select File Folder')
    if not filepath:
        return
    outputPath = tkinter.filedialog.askdirectory(title='Select Ouput Folder')
    if not outputPath:
        return

    for folder, dirs, files in os.walk(filepath):
        for filename in files:
            # Only PDF files can be opened by the reader below.
            if os.path.splitext(filename)[1] != '.pdf':
                continue
            fullpath = os.path.join(folder, filename)
            # Keep the source open while its pages are being copied, and
            # close it as soon as the last page has been written.
            with open(fullpath, 'rb') as source_file:
                open_pdf = PyPDF2.PdfFileReader(source_file)
                for i in range(open_pdf.numPages):
                    output = PyPDF2.PdfFileWriter()
                    output.addPage(open_pdf.getPage(i))
                    page_path = os.path.join(outputPath, filename + '-%s.pdf' % i)
                    with open(page_path, 'wb') as output_pdf:
                        output.write(output_pdf)

    tkinter.messagebox.showinfo(title='', message="Individual Files Created Successfully")
    
def renamePDF():
    """Rename each PDF under a chosen folder using text from its first page.

    Asks for a folder, a search pattern and a length. For every ``.pdf``
    file found while walking the folder, the text of the first page is
    extracted; the characters that follow the first match of the pattern
    (up to the requested length, with all whitespace removed) become the
    new file name. Renamed files are placed directly in the selected
    folder, including files that were found in sub-folders.

    The search text is interpreted as a regular expression. Files whose
    first page has no match, or whose extracted name is empty, are left
    untouched. The function returns without doing anything if any dialog
    is cancelled.
    """
    filepath = tkinter.filedialog.askdirectory(title='Select File Folder')
    if not filepath:
        return

    search_text = tkinter.simpledialog.askstring(title='', prompt='Enter the text to be Searched')
    text_position = tkinter.simpledialog.askstring(title='', prompt='Enter the length of a string after the searched text')
    # askstring returns None when the dialog is cancelled.
    if search_text is None or text_position is None:
        return
    try:
        name_length = int(text_position)
    except ValueError:
        tkinter.messagebox.showerror(title='', message="The length must be a whole number")
        return

    for folder, dirs, files in os.walk(filepath):
        for filename in files:
            if os.path.splitext(filename)[1] != '.pdf':
                continue
            fullpath = os.path.join(folder, filename)

            # Read the text inside a context manager so the file handle is
            # released before the rename, whether or not a match is found.
            with pdfplumber.open(fullpath) as open_pdf:
                pages = open_pdf.pages
                pdf_text = (pages[0].extract_text() if pages else None) or ''

            # Only the first match is used: once the file has been renamed,
            # its old path no longer exists for any further match.
            match = re.search(search_text, pdf_text)
            if match is None:
                continue
            doc_text = pdf_text[match.end():match.end() + name_length]
            doc_newname = re.sub(r"\s+", "", doc_text)
            if not doc_newname:
                continue
            os.rename(fullpath, os.path.join(filepath, doc_newname + '.pdf'))

    tkinter.messagebox.showinfo(title='', message="Files Renamed Successfully")
    
def removedPassword():
    """Save an unprotected copy of a single password-protected PDF.

    Asks for the PDF file, its password and a destination folder, opens the
    file with pikepdf using that password and saves it as
    ``newfilewithoutpasword.pdf`` in the destination folder. Returns without
    doing anything if any dialog is cancelled; shows an error box if pikepdf
    rejects the password.
    """
    filename = tkinter.filedialog.askopenfilename()
    if not filename:
        return
    password = tkinter.simpledialog.askstring(title='Enter Password', prompt='')
    # askstring returns None when the dialog is cancelled.
    if password is None:
        return
    filepath = tkinter.filedialog.askdirectory(title='Select Folder to Save File')
    if not filepath:
        return

    try:
        # The context manager closes the source file once the copy is saved.
        with pikepdf.Pdf.open(filename, password=password) as my_pdf:
            my_pdf.save(os.path.join(filepath, 'newfilewithoutpasword.pdf'))
    except pikepdf.PasswordError:
        tkinter.messagebox.showerror(title='', message="Incorrect password")
        return

    tkinter.messagebox.showinfo(title='', message="Password has been Removed Successfully")

def pdftoTables():
    """Extract tables from every PDF under a chosen folder into Excel files.

    Asks for a folder and a page selection (the string is passed unchanged
    to ``camelot.read_pdf`` as ``pages``). For each file whose name ends in
    ``.pdf`` found while walking the folder, the tables detected with the
    ``stream`` flavor are written to ``<file name without extension>.xlsx``
    in the selected folder, one sheet per table (``sheet0``, ``sheet1``, ...).
    Returns without doing anything if either dialog is cancelled or the
    page selection is empty.
    """
    # The file-level import binds pandas to the name `p`, so bring in the
    # conventional `pd` alias locally.
    import pandas as pd

    directoryPath = tkinter.filedialog.askdirectory(title='Select File Folder')
    if not directoryPath:
        return
    input_page_nos = tkinter.simpledialog.askstring(title='Enter Page No.', prompt='E,g. 1,2,3 or'+'  all')
    if not input_page_nos:
        return

    for folders, sub_folders, files in os.walk(directoryPath):
        for name in files:
            if not name.endswith(".pdf"):
                continue
            filename = os.path.join(folders, name)
            tables = camelot.read_pdf(filename, pages=input_page_nos, flavor='stream')
            # splitext keeps dots inside the name ("a.b.pdf" -> "a.b").
            newname = os.path.splitext(name)[0]
            workbook_path = os.path.join(directoryPath, newname + '.xlsx')
            # The context manager saves and closes the workbook on exit, so
            # no explicit save call is needed.
            with pd.ExcelWriter(workbook_path, engine='xlsxwriter') as writer:
                for x, table in enumerate(tables):
                    table.df.to_excel(writer, sheet_name='sheet%s' % x, index=False, header=False)

    tkinter.messagebox.showinfo(title='', message="Tables Extracted Successfully")

def mergePDFs():
    """Merge all PDFs in a chosen folder into ``newmergedfile.pdf``.

    Asks for a folder, appends the pages of every ``*.pdf`` file directly in
    that folder (in sorted file-name order) to a new pikepdf document and
    saves it as ``newmergedfile.pdf`` in the same folder. An existing
    ``newmergedfile.pdf`` from a previous run is not used as an input.
    Returns without doing anything if the dialog is cancelled.
    """
    directory = tkinter.filedialog.askdirectory(title='Select Folder to Save File')
    if not directory:
        return

    output_name = 'newmergedfile.pdf'
    output_path = os.path.join(directory, output_name)
    pdf = pikepdf.Pdf.new()
    sources = []
    try:
        # Sorted so the page order of the result is predictable.
        for pdf_filepath in sorted(glob.glob(os.path.join(directory, '*.pdf'))):
            # Skip the result of an earlier merge so it is not merged into itself.
            if os.path.normcase(os.path.basename(pdf_filepath)) == os.path.normcase(output_name):
                continue
            src = pikepdf.Pdf.open(pdf_filepath)
            # Sources are kept open until after the save and closed in the
            # finally block below.
            sources.append(src)
            pdf.pages.extend(src.pages)
        pdf.save(output_path)
    finally:
        for src in sources:
            src.close()
        pdf.close()

    tkinter.messagebox.showinfo(title='', message="Files Merged Successfully")

def pdfToword():
    """Convert every PDF in a chosen folder to .docx using MS Word.

    Asks for a folder, starts Word through COM, opens each ``*.pdf`` file
    directly in that folder and saves it next to the original with a
    ``.docx`` extension. Word is quit when the loop finishes or fails.
    Returns without starting Word if the dialog is cancelled.
    """
    pdfs_path = tkinter.filedialog.askdirectory()  # folder where the .pdf files are stored
    if not pdfs_path:
        return

    # Start Word only after a folder was chosen so a cancelled dialog does
    # not leave a Word instance running.
    word = win32com.client.Dispatch("Word.Application")
    word.visible = 1
    try:
        for doc in glob.iglob(os.path.join(pdfs_path, "*.pdf")):
            in_file = os.path.abspath(doc)
            out_file = os.path.splitext(in_file)[0] + ".docx"
            wb = word.Documents.Open(in_file)
            try:
                wb.SaveAs2(out_file, FileFormat=16)  # file format for docx
            finally:
                wb.Close()
    finally:
        word.Quit()

    tkinter.messagebox.showinfo(title='', message="Files Converted to Word Successfully")
    

# --- Tool buttons: one per feature, all the same width ---
button1 = Button(root, width=70, bg='#ffecb3', command=createIndividualPDF,
                 text='Split Multi-Page PDF File into One Page PDF File(s)')
button2 = Button(root, width=70, bg='#f9fbe7', command=renamePDF,
                 text='Rename Multiple PDF Files with their Respective Contents')
button3 = Button(root, width=70, bg='lightblue', command=removedPassword,
                 text='Remove Password (Single PDF File)')
button4 = Button(root, width=70, bg='#80cbc4', command=pdftoTables,
                 text='Copy Multiple/Single PDF file(s) to Excel Sheet(s)')
button5 = Button(root, width=70, bg='#ede7f6', command=mergePDFs,
                 text='Merge Multiple PDF Files Into One')
button6 = Button(root, width=70, bg='#90caf9', command=pdfToword,
                 text='Convert Multiple/Single PDF file(s) to Word')

# Kept separate so the apostrophe does not clash with the quoting below.
text1 = "Don't "

# --- Usage notes shown under the buttons ---
label1 = Label(root, text='Note:')
label2 = Label(root, text='1. Remove password if the file is protected before processing'
'\n2. Always make a copy of your original file before processing'
'\n     3. Rename Function will work only if each file has unique record')
label3 = Label(root, text='     4. Check on "'+text1+'show this message again" and Click on "OK" ')
label4 = Label(root, text='        in the "MS-Word" window when you use "PDF to Word" for the first time"')

createdby = Label(root,text= '- Developed by Umesh Agarwal')

# Buttons are stacked 30 pixels apart, starting at y=20.
for row, tool_button in enumerate((button1, button2, button3, button4, button5, button6)):
    tool_button.place(x=20, y=20 + 30 * row)

# Absolute positions of the note labels.
for note_label, left, top in (
    (label1, 20, 210),
    (label2, 15, 230),
    (label3, 15, 277),
    (label4, 15, 294),
):
    note_label.place(x=left, y=top)

createdby.place(x=350, y=450)

# Hand control to tkinter until the window is closed.
root.mainloop()

Read More:

COMMENTS