#! /usr/bin/python3
# ===================================================================
# create combined pdf file from html files
#
# Based on: www.geeksforgeeks.org/python-convert-html-pdf/
# Documentation: pdfkit.org/docs/guide.pdf
#
# Based on: stackoverflow.com/questions/3444645/merge-pdf-files
#
# look at...
# www.programcreek.com/python/example/100586/pdfkit.from_file
# ===================================================================
import pdfkit
from PyPDF2 import PdfFileMerger
import os, sys
# ---- wkhtmltopdf options handed to pdfkit.from_file() for every
# ---- html file that is converted
options = {
    # ---- page geometry
    'page-size': 'Letter',
    'margin-top': '0.5in',
    'margin-right': '0.5in',
    'margin-bottom': '0.5in',
    'margin-left': '0.5in',
    # ---- text encoding
    'encoding': "UTF-8",
    # ---- repeatable options are given as lists of (name, value)
    'custom-header': [('Accept-Encoding', 'gzip')],
    # NOTE(review): the cookie names/values look like placeholders
    # ---- (presumably from the pdfkit example) -- TODO confirm
    'cookie': [
        ('cookie-name1', 'cookie-value1'),
        ('cookie-name2', 'cookie-value2'),
    ],
    # ---- None value: the option is a flag with no argument
    # ---- (per the pdfkit documentation)
    'no-outline': None,
}
# ---- html input files, in the order they appear in the merged pdf:
# ---- the index page first, then one page per project tag
htmlfls = ['./test/index.html'] + [
    f'./test/project_{tag}.html'
    for tag in (
        '001', '002', '003', '004', '005', '006', '008',
        '010', '011', '015', '016', '017', '018',
        '019a', '019b', '019',
        '020', '021', '022',
        '100', '507',
    )
]
# -------------------------------------------------------------------
# ---- function: file exists
# -------------------------------------------------------------------
def file_exists(f):
    """Report whether the path *f* exists.

    A notice is printed to stdout when the path is missing.

    Returns:
        True when os.path.exists(f) is true, otherwise False.
    """
    found = os.path.exists(f)
    if not found:
        print(f'The file {f} does not exist')
    return found
# -------------------------------------------------------------------
# ---- function: delete files
# -------------------------------------------------------------------
def delete_files(lst):
    """Delete every file named in *lst*.

    A missing file is reported once on stdout and skipped; the
    remaining files are still deleted, so one missing entry does not
    leave the later files behind.

    Args:
        lst: iterable of file paths to remove.

    Returns:
        A tuple (ok, cnt): ok is True only when every path was
        removed, cnt is the number of files actually deleted.

    Raises:
        OSError: if os.remove fails for a reason other than the file
            being missing (for example the path is a directory).
    """
    ok = True
    cnt = 0                                 # deleted file count
    for f in lst:
        # ---- just try the removal; a separate "exists" check first
        # ---- would be racy and would report the problem twice
        try:
            os.remove(f)
        except FileNotFoundError:
            print(f'The file {f} does not exist')
            ok = False
        else:
            cnt += 1
    return (ok, cnt)
# -------------------------------------------------------------------
# ---- function: merge pdf files into a single file
# -------------------------------------------------------------------
def merge_pdf_files(pdfs,outfile='merged.pdf'):
    """Concatenate the pdf files in *pdfs* into the single file *outfile*.

    Args:
        pdfs: iterable of pdf file paths, appended in iteration order.
        outfile: path of the merged pdf to write.

    Returns:
        The number of pdf files appended to the merger.

    Any exception raised by the merger while appending or writing
    propagates to the caller; the merger is closed first.
    """
    cnt = 0                                 # pdf file count
    merger = PdfFileMerger()
    try:
        for f in pdfs:
            merger.append(f)
            cnt += 1
        merger.write(outfile)
    finally:
        # ---- close the merger even when append/write fails
        merger.close()
    return cnt
# -------------------------------------------------------------------
# ---- function: create pdf files from html files
# -------------------------------------------------------------------
def create_pdf_from_html(lst):
    """Convert each html file in *lst* into its own pdf file.

    The output files are named ./file_NNN.pdf, where NNN is the
    1-based position of the input file in *lst*.

    Args:
        lst: iterable of html file paths.

    Returns:
        A tuple (pdfs, cnt). pdfs lists the pdf files that exist
        after conversion; cnt is the number of input files processed.
        If an input file is missing, processing stops and the tuple
        is (pdfs, False), with pdfs holding the files made so far.
    """
    cnt = 0                                 # input file count
    pdfs = []                               # pdf file names
    for f in lst:
        cnt += 1
        print(f'[{cnt:03}] {f}')            # display count and input file
        if not file_exists(f):
            return (pdfs, False)

        # ---- convert html file to pdf file
        # ---- pdfkit can raise even though the output file was
        # ---- written, so an error is reported and not fatal;
        # ---- catching Exception (not a bare except) leaves
        # ---- KeyboardInterrupt/SystemExit alone
        ff = f'./file_{cnt:03}.pdf'         # output file
        try:
            pdfkit.from_file(f, ff, options=options)
        except Exception as err:
            print(f'warning: pdfkit reported an error for {f}: {err}')

        # ---- keep only output files that exist, so later
        # ---- processing is not handed a file that was never made
        if os.path.exists(ff):
            pdfs.append(ff)
        else:
            print(f'warning: no pdf was produced for {f}; skipping it')
    return (pdfs, cnt)
# -------------------------------------------------------------------
# ---- function: get list of files in directory
# -------------------------------------------------------------------
def get_list_files_in_dir(directory):
    """Return the entry names found in *directory*, as given by os.listdir."""
    return os.listdir(directory)
# -------------------------------------------------------------------
# ---- function: display list of files in directory
# -------------------------------------------------------------------
def display_files_in_dir(directory):
    """Print the entry names of *directory*, one per line.

    The listing is preceded by a blank line and a header banner and
    followed by a footer banner. Nothing is returned.
    """
    header = '---files-in-dir----------------------------------'
    footer = '-------------------------------------------------'
    # ---- read the listing first so nothing is printed if it fails
    entries = get_list_files_in_dir(directory)
    print()
    print(header)
    for name in entries:
        print(name)
    print(footer)
# -------------------------------------------------------------------
# ---- main
# -------------------------------------------------------------------
# ---- script entry: convert the html files, merge the resulting
# ---- pdf files into one pdf, then delete the per-file pdfs
if __name__ == '__main__':

    print()
    print('---- create initial pdf files from html files')
    (pdfs,cnt) = create_pdf_from_html(htmlfls)

    # ---- create_pdf_from_html returns False as the count when an
    # ---- input file is missing; do not merge a partial set
    if cnt is False:
        print('aborting: an input html file is missing')
        delete_files(pdfs)                  # remove the partial output
        sys.exit(1)
    print(f'created {cnt} pdf files')

    print()
    print('---- merge pdf files into a single pdf file')
    cnt = merge_pdf_files(pdfs)
    print(f'merged {cnt} pdf files')

    print()
    print('---- delete initial pdf files')
    (tf,cnt) = delete_files(pdfs)
    print(f'deleted {cnt} initial pdf files')
    if not tf:
        print('warning: some initial pdf files were already missing')

    # ---- exit program
    print()