forked from TrisSherliker/buntool
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbundle.py
More file actions
executable file
·2264 lines (2078 loc) · 101 KB
/
bundle.py
File metadata and controls
executable file
·2264 lines (2078 loc) · 101 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# TODO
##############################################
## BUGS
##############################################
# GENERAL
# - [x] make sure temp files delete
# - [x] add cron job to server to cleanup every few mins
# - [x] draft docx indexing
# - [x] responsive index overflows at some breakpoints
# - [ ] Possible niggle: handling of filenames with multiple `.` characters in names, or none of them. Is the code depending too much on there being an extension to the file at all?
##############################################
## ROADMAP
##############################################
# Technical improvements
# - [ ] General error handling in functions of app.py (file saving, dir creation, csv reading/writing)
# - [ ] Validation of all strings passed through frontend
# - [ ] validation of csv data passed from frontend, check headers and columns.
# Features
# - [ ] Add ability to offset page numbers (start at N)
# - [ ] Convenience for sections: Add section header, spawn upload area for that section, helps to organise files
# - [ ] Add a write-metadata function: https://pypdf.readthedocs.io/en/stable/user/metadata.html
# - [ ] ability to reload state (via zip import).
# This would require --
# - [ ] save option state (as json?)
# - [ ] save csv
# - [ ] save input files
# - [ ] allow upload of zip which is then parsed out into options/csv/inputfiles
# - [ ] the data structure point above will help with this, because then it just becomes a matter of setting variables from the lines of the file.
# PDF manipulation
from pypdf import PdfReader, PdfWriter
from pypdf.annotations import Link
from pypdf.generic import Fit
from pikepdf import Pdf, OutlineItem, Dictionary, Name, PdfError
import pdfplumber
# reportlab stuff
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Spacer, Paragraph, Frame, PageBreak
from reportlab.lib import colors
from reportlab.lib.enums import TA_JUSTIFY, TA_LEFT, TA_CENTER, TA_RIGHT
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import cm
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.pdfgen import canvas
# from reportlab.rl_config import defaultPageSize
import reportlab.rl_config
from reportlab.rl_config import defaultPageSize
# custom
from makedocxindex import create_toc_docx
# General
import os
import re
import argparse
import shutil
import csv
import logging
import zipfile
import tempfile
from datetime import datetime
from werkzeug.utils import secure_filename
# Module-level state shared by the functions in this file.

# ReportLab default page dimensions; used in more than one function.
PAGE_WIDTH = defaultPageSize[0]
PAGE_HEIGHT = defaultPageSize[1]

# Bundle configuration object; assigned by load_bundle_config().
bundle_config = None

# Shared logger; configure_logger() attaches the console and file handlers.
bundle_logger = logging.getLogger('bundle_logger')
# The per-session logging.FileHandler created by configure_logger().
session_file_handler = None
def configure_logger(session_id=None):
    '''
    Configure the shared 'bundle_logger' with a console handler and a
    per-session log file, and return it.

    Temp files are saved in /tmp/tempfiles/[session_id] (hardcoded in app.py)
    where session_id is an 8-digit hex number.
    Since the temp files are deleted in production, logs are stored
    separately, in the directory given by bundle_config.logs_dir.

    Args:
        session_id: Identifier used in the log file name
            (buntool_<session_id>.log). If falsy, a timestamp
            (YYYYmmddHHMMSS) is used instead.

    Returns:
        The configured logging.Logger. The module globals bundle_logger and
        session_file_handler are updated as a side effect.
    '''
    global session_file_handler
    global bundle_logger

    logs_dir = bundle_config.logs_dir
    # exist_ok avoids a race between checking for the directory and creating it.
    os.makedirs(logs_dir, exist_ok=True)

    bundle_logger = logging.getLogger('bundle_logger')

    # Remove handlers from any previous run to prevent duplicate log lines.
    # Each one is closed as well: merely clearing the list would leave the
    # previous session's log file open.
    for stale_handler in list(bundle_logger.handlers):
        bundle_logger.removeHandler(stale_handler)
        stale_handler.close()

    bundle_logger.setLevel(logging.DEBUG)
    bundle_logger.propagate = False
    formatter = logging.Formatter('%(asctime)s-%(levelname)s-[BUN]: %(message)s')

    console_handler = logging.StreamHandler()
    console_handler.setFormatter(formatter)
    bundle_logger.addHandler(console_handler)

    if not session_id:
        session_id = datetime.now().strftime("%Y%m%d%H%M%S")  # fallback

    # logs path = buntool_<session_id>.log
    logs_path = os.path.join(logs_dir, f"buntool_{session_id}.log")
    session_file_handler = logging.FileHandler(logs_path)
    session_file_handler.setLevel(logging.DEBUG)
    session_file_handler.setFormatter(formatter)
    bundle_logger.addHandler(session_file_handler)
    return bundle_logger
def remove_temporary_files(list_of_temp_files):
    '''
    Delete the temporary files recorded during bundle processing.

    Run at the end of the bundle process. Each path in the list is removed
    in turn and the outcome is logged. Paths that no longer exist are only
    logged. Paths that cannot be removed are collected and returned; per
    the original note, a separate periodic process flushes leftovers in
    production.

    Args:
        list_of_temp_files: Iterable of file paths to delete.

    Returns:
        List of paths that existed but could not be deleted.
    '''
    bundle_logger.debug(f"[CB]Cleaning up temporary files: {list_of_temp_files}")
    undeleted = []  # paths that survived the deletion attempt

    for temp_path in list_of_temp_files:
        if not os.path.exists(temp_path):
            bundle_logger.info(f"[CB]....Info: Temporary file {temp_path} does not exist, nothing to clean up.")
            continue

        bundle_logger.debug(f"[CB]..Deleting: {temp_path}")
        try:
            os.remove(temp_path)
            if os.path.exists(temp_path):
                bundle_logger.info(f"[CB]....could not delete {temp_path}. File will be deleted in periodic cleanup.")
                undeleted.append(temp_path)
            else:
                bundle_logger.debug("[CB]....deleted.")
        except Exception as e:
            bundle_logger.info(f"[CB]....Info: could not delete temporary file {temp_path}. Error: {e}")
            undeleted.append(temp_path)

    if not undeleted:
        bundle_logger.info("[CB]..All temporary files deleted successfully.")
    else:
        bundle_logger.info(f"[CB]..Remaining temporary files (will be deleted on next system flush): {undeleted}")
    return undeleted
def sanitise_latex(text):
    '''
    Homebrew LaTeX sanitiser: strip unsupported characters and escape the
    characters LaTeX treats specially.

    Potential alternative available at: https://pythonhosted.org/latex/
    which has an escape_latex function.

    However, this is entirely unused in production: the LaTeX functionality
    has been ported to ReportLab, which is more portable for deployment on AWS.
    The LaTeX functions are maintained because they work well, look good, and
    are sometimes preferred for self-hosted use.

    There is a simple way to enable LaTeX indexing. To make it work, replace
    calls to the reportlab style functions with calls to the LaTeX functions,
    and alter the values in the frontend 'index font' and 'footer font' form
    fields (in buntool.js) to reference the font names expected by LaTeX.

    Args:
        text: The string to sanitise.

    Returns:
        The sanitised string. Every character outside 0x20-0x7F is removed
        first (emojis, accented letters and control characters, newlines
        included); the remaining special characters are then replaced with
        LaTeX-safe sequences.
    '''
    replacements = {
        '_': '\\_',
        '$': '\\$',
        '%': '\\%',
        '#': '\\#',
        '{': '\\{',
        '&': '\\&',
        '}': '\\}',
        '[': '{[}',
        ']': '{]}',
        '"': "{''}",
        '|': '\\textbar{}',
        '\\': '\\textbackslash{}',
        '~': '\\textasciitilde{}',
        '<': '\\textless{}',
        '>': '\\textgreater{}',
        '^': '\\textasciicircum{}',
        '`': '{}`',
        # NOTE: never hit at present, because the filter below strips
        # newlines (0x0A is below 0x20) before the table is applied.
        '\n': '\\\\',
    }
    # Remove emojis and other non-ASCII characters (ascii list from space 0x20 onwards)
    text = re.sub(r'[^\x20-\x7F]+', '', text)
    # Replace awkward ascii characters with LaTeX commands in a single pass.
    sanitised_text = text.translate(str.maketrans(replacements))
    bundle_logger.debug(f"[SL].... Sanitised input '{text}' for LaTeX output '{sanitised_text}'")
    return sanitised_text
def load_bundle_config(bundle_config_data):
    '''
    Store the supplied configuration object in the module-level
    bundle_config, where the other functions in this file read their
    settings from (e.g. date_setting, bookmark_setting, logs_dir).
    '''
    global bundle_config
    bundle_config = bundle_config_data
def parse_the_date(date):
    '''
    Format a YYYY-MM-DD date string according to bundle_config.date_setting.

    Supported settings:
    - YYYY-MM-DD
    - DD-MM-YYYY
    - MM-DD-YYYY
    - uk_longdate
    - us_longdate
    - uk_abbreviated_date
    - us_abbreviated_date
    or, if the setting is hide_date, the input is returned untouched.

    Args:
        date: The date as a 'YYYY-MM-DD' string.

    Returns:
        The reformatted date string. The input is returned unchanged when it
        is empty, is not a valid YYYY-MM-DD calendar date, or when the date
        setting is unknown (the latter two cases are logged as errors).
    '''
    if bundle_config.date_setting == "hide_date":
        return date

    # Rows without a date (e.g. section breaks) have nothing to format.
    if not date:
        return date

    # fullmatch rejects trailing garbage such as '2024-01-01T10:00', which a
    # prefix-only match would let through to strptime.
    if not isinstance(date, str) or not re.fullmatch(r'\d{4}-\d{2}-\d{2}', date):
        bundle_logger.error(f"[PTD] Error: Date does not match expected format: {date}")
        return date

    try:
        parsed_date = datetime.strptime(date, '%Y-%m-%d')
    except ValueError:
        # Right shape but not a real calendar date, e.g. '2024-13-45'.
        bundle_logger.error(f"[PTD] Error: Date does not match expected format: {date}")
        return date

    formats = {
        "YYYY-MM-DD": "%Y-%m-%d",
        "DD-MM-YYYY": "%d/%m/%Y",
        "MM-DD-YYYY": "%m/%d/%Y",
        "uk_longdate": "%d %B %Y",
        "us_longdate": "%B %d, %Y",
        "uk_abbreviated_date": "%d %b %Y",
        "us_abbreviated_date": "%b %d, %Y"
    }
    date_format = formats.get(bundle_config.date_setting)
    if date_format is None:
        bundle_logger.error(f"[PTD] Error: Unknown date setting: {bundle_config.date_setting}")
        return date
    return parsed_date.strftime(date_format)
def load_index_data(csv_index):
    '''
    Ingest a CSV of table-of-contents entries and return a dictionary of
    the data (in the create bundle function, saved as index_data). The
    resulting dictionary is the template for the whole bundle creation.

    The CSV is typically generated by the frontend and is expected to be
    formatted as follows:

    Headings:
        filename, userdefined_title, date, section
    where 'section' is a section-marker flag.
    for normal input files:
        [filename, title, date, 0]
    for section breaks:
        [SECTION, section_name,,1]

    There are some fallbacks in place in case data is missing, but this
    should not happen. They are there mainly for testing purposes when
    using the code via CLI:
    - columns beyond the fourth are ignored;
    - missing date / section / title columns become empty strings;
    - blank lines are skipped;
    - an empty file yields an empty dictionary.

    Args:
        csv_index: Path of the CSV file. The first row is treated as a
            header and skipped.

    Returns:
        dict mapping filename -> (userdefined_title, formatted_date, section),
        in file order. Because the result is keyed by filename, a repeated
        filename overwrites the earlier row (a warning is logged).
    '''
    index_data = {}
    bundle_logger.debug(f"[LID]Loading index data from {csv_index}")
    with open(csv_index, 'r', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # Skip header row; tolerate a completely empty file
        for row in reader:
            if not row:
                continue  # csv.reader yields [] for blank lines

            filename = row[0]
            userdefined_title = row[1] if len(row) > 1 else ''
            if len(row) >= 3:
                formatted_date = parse_the_date(row[2])
            else:
                bundle_logger.debug(f"Reading file entry: |{filename}|")
                formatted_date = ''
            section = row[3] if len(row) >= 4 else ''

            if filename in index_data:
                bundle_logger.warning(
                    f"[LID]..Duplicate index entry for |{filename}|; the earlier row is overwritten.")
            # Store filename as provided by frontend
            index_data[filename] = (userdefined_title, formatted_date, section)

    bundle_logger.debug(f"[LID]..Loaded index data with {len(index_data)} entries:")
    for k, v in index_data.items():
        bundle_logger.debug(f"[LID]....Key: |{k}| -> Value: {v}")
    return index_data
def get_pdf_creation_date(file):
    '''
    Extract the creation date from a PDF file's document info.

    This is purely a fallback function in case the user-supplied
    (or frontend-supplied) information is missing a date.

    Args:
        file: Path of the PDF to inspect.

    Returns:
        The creation date formatted as 'DD.MM.YYYY', or None when the PDF
        has no /CreationDate, the value cannot be parsed, or the file cannot
        be opened (failures are logged, not raised).
    '''
    try:
        with Pdf.open(file) as pdf:
            creation_date = pdf.docinfo.get('/CreationDate', None)
            if creation_date:
                # Convert to string if it's a pikepdf.String object. Done
                # while the PDF is still open.
                creation_date_str = str(creation_date)
                # Expected form is D:YYYYMMDDHHmmSS...; the 'D:' prefix is
                # treated as optional rather than sliced off blindly.
                date_match = re.match(r'\s*(?:D:)?(\d{8})', creation_date_str)
                if date_match is None:
                    bundle_logger.error(
                        f"[GPCD]Error extracting creation date from {file}: "
                        f"unrecognised date value '{creation_date_str}'")
                    return None
                date_obj = datetime.strptime(date_match.group(1), '%Y%m%d')
                return date_obj.strftime('%d.%m.%Y')
    except Exception as e:
        bundle_logger.error(f"[GPCD]Error extracting creation date from {file}: {e}")
    return None
def _find_input_file(filename, input_files):
    '''Return the existing path in input_files that corresponds to filename, or None.'''
    wanted = os.path.basename(filename)
    # Prefer an exact basename match, so that 'a.pdf' cannot be satisfied by
    # '/tmp/data.pdf' merely because the name is a substring of the path.
    candidate = next(
        (path for path in input_files if os.path.basename(path) == wanted), None)
    if candidate is None:
        # Legacy behaviour, kept as a fallback: substring match on the path.
        candidate = next((path for path in input_files if wanted in path), None)
    if candidate is None:
        return None
    if not os.path.exists(candidate):
        bundle_logger.debug(f"[MPCTE]..Error: File {filename} not found at {candidate}")
        return None
    bundle_logger.debug(f"[MPCTE]..File {filename} found at {candidate}")
    return candidate


def merge_pdfs_create_toc_entries(input_files, output_file, index_data):
    '''
    Two jobs at once. index_data is the roadmap for the bundle creation.
    1. Merge the PDFs in input_files into a single PDF at output_file.
    2. Create a table of contents from the index_data, and return it.

    The table of contents is based on the index_data and the structural
    results of merging the files together.

    Args:
        input_files: Full paths of the PDFs available for merging.
        output_file: Path the merged PDF is saved to.
        index_data: dict of filename -> (title, date, section), as produced
            by load_index_data. A section value of "1" marks a section break.

    Returns:
        A list of toc_entries, in index order:
        - section breaks: ("SECTION_BREAK_<n>", title)
        - files: (tab number such as "001.", title, date, page), where page
          is the zero-based index of the file's first page in the merged PDF.

        Files that cannot be located or opened are skipped (and logged).
        A skipped file still consumes its tab number.

    Raises:
        Any unexpected exception raised while processing an entry is logged
        and re-raised.
    '''
    pdf = Pdf.new()
    opened_sources = []  # kept open until the merged PDF is saved, then closed
    page_count = 0
    toc_entries = []
    tab_count = 1
    section_count = 1

    try:
        # Iterate through the lines of index data
        for filename, (title, date, section) in index_data.items():
            if section == "1":
                # Sections are easy
                toc_entries.append((f"SECTION_BREAK_{section_count}", title))
                section_count += 1
                continue

            try:
                # Files are more complex. They require:
                # - Set tab number:
                tab_number = f"{tab_count:03}."
                tab_count += 1

                # - Locate the file: filename is only the base name, while
                #   input_files holds the full paths.
                this_file_path = _find_input_file(filename, input_files)
                if not this_file_path:
                    bundle_logger.debug(f"[MPCTE]File {filename} not found in input_files.")
                    continue

                # - Merge and count pages:
                try:
                    src = Pdf.open(this_file_path)
                    opened_sources.append(src)
                    pages_in_file = len(src.pages)
                    pdf.pages.extend(src.pages)
                    bundle_logger.debug("[MPCTE]....added to merged PDF")
                except Exception as e:
                    bundle_logger.debug(f"Error counting pages in {os.path.basename(this_file_path)}: {e}")
                    continue
                first_page = page_count  # zero-based start of this file
                page_count += pages_in_file

                # - Add to outline:
                if index_data and os.path.basename(filename) in index_data:
                    bundle_logger.debug("[MPCTE]..found index data")
                    title, date, section = index_data[os.path.basename(filename)]
                else:
                    title = os.path.splitext(os.path.basename(filename))[0]
                    # Read the date from the resolved path; the bare filename
                    # is not a usable path on its own.
                    date = get_pdf_creation_date(this_file_path) or "Unknown"
                    bundle_logger.debug(f"[MPCTE]..Not in index. Using alternative data: Title: {title}, Date: {date}")

                bundle_logger.debug(f"[MPCTE]..Adding toc entry: {tab_number}, {title}, {first_page}")
                toc_entries.append((tab_number, title, date, first_page))
            except Exception as e:
                bundle_logger.debug(f"[MPCTE] Error merging and creating toc entries for {filename}: {e}")
                raise

        pdf.save(output_file)
    finally:
        for opened in opened_sources:
            opened.close()
    return toc_entries
def add_bookmarks_to_pdf(pdf_file, output_file, toc_entries, length_of_frontmatter):
    '''
    Add outline entries ('bookmarks') to a PDF for navigation.

    One outline item is appended for every file entry in toc_entries;
    section-break entries and a header row (Tab/Title/.../Page) are skipped.
    Due to loose naming conventions this can be confusing, so to be clear:
    - It does not bookmark the index itself (that's the job of bookmark_the_index).
    - It does not add on-page hyperlinks (per the original note, that is
      add_hyperlinks).

    The text of each item depends on bundle_config.bookmark_setting:
        "tab-title" (default)
        "tab-title-date"
        "tab-title-page"
        "tab-title-date-page"
    An unknown setting is logged as an error and treated as "tab-title".

    Args:
        pdf_file: Path of the PDF to read.
        output_file: Path the bookmarked PDF is saved to.
        toc_entries: Entries as (tab_number, title, date, page) tuples.
        length_of_frontmatter: Number added to each entry's page to obtain
            the outline destination.
    '''
    with Pdf.open(pdf_file) as pdf:
        with pdf.open_outline() as outline:
            for entry in toc_entries:
                if "SECTION_BREAK" in entry[0]:  # ignore section entries
                    continue
                looks_like_header = (
                    "tab" in entry[0].lower()
                    and "title" in entry[1].lower()
                    and "page" in entry[3].lower()
                )
                if looks_like_header:
                    continue

                tab_number, title, date, page = entry
                destination = page + length_of_frontmatter
                setting = bundle_config.bookmark_setting

                if setting == "tab-title-date":
                    label = f"{tab_number} {title} ({date})"
                elif setting == "tab-title-page":
                    label = f"{tab_number} {title} [pg.{1+ page + length_of_frontmatter}]"
                elif setting == "tab-title-date-page":
                    label = f"{tab_number} {title} ({date}) [pg.{1 + page + length_of_frontmatter}]"
                else:
                    if setting != "tab-title":
                        bundle_logger.error(f"[ABTP]Error: Unknown bookmark_setting: {bundle_config.bookmark_setting}")
                    label = f"{tab_number} {title}"

                outline.root.append(OutlineItem(label, destination))
        pdf.save(output_file)
def merge_frontmatter(input_files, output_file):
    '''
    Concatenate the given PDFs, in order, into a single PDF.

    Used to merge an uploaded coversheet with the generated index, in cases
    where a coversheet is specified. The resulting frontmatter is pre-pended
    to the main bundle.

    Args:
        input_files: Paths of the PDFs to join, in output order.
        output_file: Path the combined PDF is saved to.

    Returns:
        output_file, unchanged.
    '''
    frontmatter = Pdf.new()
    for part_path in input_files:
        with Pdf.open(part_path) as part:
            frontmatter.pages.extend(part.pages)
    frontmatter.save(output_file)
    return output_file
def bookmark_the_index(pdf_file, output_file, coversheet=None):
    '''
    Insert an "Index" outline item at the top of the PDF's outline.

    add_bookmarks_to_pdf adds an outline item for each input file, but it
    cannot bookmark the index itself because it takes place earlier in the
    order of processing. This function comes back for a second pass and
    adds the outline item for the index.

    Args:
        pdf_file: Path of the PDF to read.
        output_file: Path the updated PDF is saved to.
        coversheet: Optional path of the coversheet PDF. When given, the
            index is taken to start on the first page after it.
    '''
    with Pdf.open(pdf_file) as pdf:
        with pdf.open_outline() as outline:
            if coversheet:
                # Page indices are 0-based, so the coversheet's page count is
                # also the index of the first page after it.
                with Pdf.open(coversheet) as coversheet_pdf:
                    index_page = len(coversheet_pdf.pages)
                log_message = "[BTI]coversheet is specified, outline item added for index"
            else:
                # No coversheet: the index starts on the first page.
                index_page = 0
                log_message = "[BTI]no coversheet specified, outline item added for index"

            outline.root.insert(0, OutlineItem("Index", index_page))
            bundle_logger.debug(log_message)
        pdf.save(output_file)
def create_toc_pdf_reportlab(
        toc_entries,
        casedetails,
        output_file,
        confidential=False,
        date_setting="YYYY-MM-DD",
        index_font_setting=None,
        dummy=False,
        frontmatter_offset=0,
        length_of_coversheet=0,
        page_num_alignment=None,
        page_num_font=None,
        page_numbering_style=None,
        footer_prefix=None,
        main_page_count=0,
        roman_numbering=False
):
    '''
    Build the index (table of contents) PDF with ReportLab.

    The first version of buntool generated the index file and page numbering
    with LaTeX, but LaTeX is a complicated dependency for generating this
    sort of thing. So this is a refactored version that uses ReportLab.

    This function is a drop-in replacement for the earlier version,
    create_toc_pdf_tex. The LaTeX chain is preserved for personal local
    usage, though is unused in this code. While some of the arguments aren't
    used in this version, the signature preserves the structure of the LaTeX
    version for maintainability.

    The major difference is how footer page numbers are generated: this
    passes a separate function, reportlab_footer_config, as the page
    callback (skipped when bundle_config.roman_for_preface is set).

    Approach:
    - parse options
    - set up fonts and styles
    - add tables for the header matter
    - generate a long table for the main table of contents, which can
      flow across pages
    - build the PDF

    Args:
        toc_entries: List of entries: (tab, title, date, page) for files and
            ("SECTION_BREAK_<n>", title) for section breaks. NOTE: if the
            list does not already start with a Tab/Title/Date/Page header
            row, one is inserted into the caller's list in place.
        casedetails: Indexable; [0] is the bundle title, [1] the claim
            number and [2] the case name.
        output_file: Path the index PDF is written to.
        confidential: If true, the bundle title is prefixed with a red
            "CONFIDENTIAL".
        date_setting: Controls the date column header and column widths
            ("hide_date" blanks the header and collapses the column).
        index_font_setting: 'serif', 'sans', 'mono' or 'traditional';
            anything else falls back to Helvetica.
        dummy: If true, every page number is rendered as 9999.
        frontmatter_offset: Added (plus one) to each entry's page number
            when dummy is false.
        length_of_coversheet, page_num_alignment, page_num_font,
        page_numbering_style, footer_prefix, main_page_count,
        roman_numbering: Not used by this implementation.
    '''
    # First, parse out the arguments.
    # Index font setting -> (main font, bold font, base size); default Helvetica.
    font_choices = {
        'serif': ('Times-Roman', 'Times-Bold', 12),
        'sans': ('Helvetica', 'Helvetica-Bold', 12),
        'mono': ('Courier', 'Courier-Bold', 10),
        'traditional': ('Charter_regular', 'Charter_bold', 12),
    }
    main_font, bold_font, base_font_size = font_choices.get(
        index_font_setting, font_choices['sans'])

    # Column widths are plain numbers here; they are multiplied by cm later.
    if date_setting == "hide_date":
        # date disabled: keep the column, just make it empty and blank out the header
        date_col_hdr = ""
        date_col_width = 0
        title_col_width = 11.5
        page_col_width = 2.5
    elif date_setting in ("YYYY-MM-DD", "DD-MM-YYYY", "MM-DD-YYYY",
                          "uk_abbreviated_date", "us_abbreviated_date"):
        date_col_hdr = "Date"
        date_col_width = 3.2
        title_col_width = 9.8
        page_col_width = 1.7
    elif date_setting in ("uk_longdate", "us_longdate"):
        date_col_hdr = "Date"
        date_col_width = 4.2
        title_col_width = 8.8
        page_col_width = 1.7
    else:
        date_col_hdr = "Date"
        date_col_width = 3.5
        title_col_width = 9.5
        page_col_width = 1.7

    page_offset = 0 if dummy else frontmatter_offset + 1

    # Now on to reportlab formatting. First, font and stylesheet wrangling
    reportlab_pdf = SimpleDocTemplate(output_file, pagesize=A4, rightMargin=1.5 * cm, leftMargin=1.5 * cm,
                                      topMargin=1 * cm, bottomMargin=1.5 * cm)

    # Register non-standard fonts.
    pdfmetrics.registerFont(TTFont('Charter_regular', 'Charter_Regular.ttf'))
    pdfmetrics.registerFont(TTFont('Charter_bold', 'Charter_Bold.ttf'))
    pdfmetrics.registerFont(TTFont('Charter_italic', 'Charter_Italic.ttf'))
    reportlab.rl_config.warnOnMissingFontGlyphs = 0

    # Set up stylesheet for the various styles used.
    styleSheet = getSampleStyleSheet()

    def make_style(font_name, font_size, **extra):
        # All index styles derive from 'Normal' and share the same leading.
        return ParagraphStyle('BodyText', parent=styleSheet['Normal'], fontName=font_name,
                              fontSize=font_size, leading=14, **extra)

    main_style = make_style(main_font, base_font_size)
    main_style_right = make_style(main_font, base_font_size, alignment=TA_RIGHT)
    bold_style = make_style(bold_font, base_font_size)
    claimno_style = make_style(bold_font, base_font_size, alignment=TA_RIGHT)
    bundle_title_style = make_style(bold_font, base_font_size + 6, alignment=TA_CENTER)
    case_name_style = make_style(bold_font, base_font_size + 2, alignment=TA_CENTER)

    styleSheet.add(ParagraphStyle(name='main_style', parent=main_style))
    styleSheet.add(ParagraphStyle(name='main_style_right', parent=main_style_right))
    styleSheet.add(ParagraphStyle(name='bold_style', parent=bold_style))
    styleSheet.add(ParagraphStyle(name='claimno_style', parent=claimno_style))
    styleSheet.add(ParagraphStyle(name='bundle_title_style', parent=bundle_title_style))
    styleSheet.add(ParagraphStyle(name='case_name_style', parent=case_name_style))

    # Now, position each element within a table.
    # There are three tables: Claim no, [Case title, bundle title], and [toc_entries]
    # Each table is defined by:
    # - define data to go into the table;
    # - define the table itself; and
    # - set the style of the table.
    # Finally, they are passed as elements to the builder function.

    # Claim No table - top right
    claimno_table_data = [
        [Paragraph(casedetails[1], claimno_style)],  # Claim No
    ]
    claimno_table = Table(data=claimno_table_data,
                          colWidths=PAGE_WIDTH * 0.9,
                          rowHeights=1.5 * cm
                          )
    claimno_table.setStyle(TableStyle([
        ('LEFTPADDING', (0, 0), (-1, -1), 0),
        ('RIGHTPADDING', (0, 0), (-1, -1), 50),
        ('TOPPADDING', (0, 0), (-1, -1), 0),
        ('BOTTOMPADDING', (0, 0), (-1, -1), 0),
        ('VALIGN', (0, 0), (-1, -1), 'BOTTOM'),
    ]))

    # Now, the case name and bundle title:
    if not confidential:
        bundle_title_markup = casedetails[0]
    else:
        bundle_title_markup = f"<font color=\"red\">CONFIDENTIAL</font> {casedetails[0]}"
    header_table_data = [
        ["", Paragraph(casedetails[2], case_name_style), ""],  # Case Name
        ["", Paragraph(bundle_title_markup, bundle_title_style), ""],  # Bundle Title
    ]
    header_table = Table(header_table_data,
                         colWidths=[PAGE_WIDTH / 8, PAGE_WIDTH * (6 / 8), PAGE_WIDTH / 8])  # aesthetic choice
    header_table.setStyle(TableStyle([
        ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
        ('ALIGN', (2, 0), (2, 0), 'RIGHT'),  # Align Claim No to the right
        ('FONTNAME', (0, 0), (-1, -1), 'Helvetica'),
        ('SIZE', (0, 0), (-1, -1), 10),
        ('LINEBELOW', (1, 1), (1, 1), 1, colors.black),  # Underline Bundle Title
        ('LINEABOVE', (1, 1), (1, 1), 1, colors.black),  # Overline Bundle Title
        ('TOPPADDING', (0, 0), (-1, -1), 8),
        ('BOTTOMPADDING', (0, 0), (-1, -1), 14),
    ]))

    # Third, the main toc entries table:
    reportlab_table_data = []
    # Check whether header row already passed. If not, pre-pend header row for table.
    # An empty list has no first row to inspect, so it simply gets the header.
    if not toc_entries or not (
            "Tab" in toc_entries[0][0] and "Title" in toc_entries[0][1]
            and "Date" in toc_entries[0][2] and "Page" in toc_entries[0][3]):
        toc_entries.insert(0, ["Tab", "Title", "Date", "Page"])

    list_of_section_breaks = []
    for rowidx, row in enumerate(toc_entries):
        row = list(row)  # copy (tuple -> list) so the caller's entry is not edited
        new_row = []
        if "Tab" in row[0] and "Title" in row[1] and "Date" in row[2] and "Page" in row[3]:
            row[2] = date_col_hdr  # This is set earlier, based on parsed date_setting
            for cell in row:
                new_row.append(Paragraph(cell, styleSheet['main_style']))
        elif "SECTION_BREAK" in row[0]:
            list_of_section_breaks.append(rowidx)  # keep track of row numbers of section breaks for later formatting
            row[0] = ""
            for cell in row:
                new_row.append(Paragraph(cell, styleSheet['bold_style']))
        else:
            if dummy:
                row[3] = 9999  # dummy page number
            else:
                row[3] += page_offset
            for cell in row:
                if isinstance(cell, str):
                    new_row.append(Paragraph(cell, styleSheet['main_style']))
                else:  # page numbers are ints, so stringify them:
                    new_row.append(Paragraph(str(cell), styleSheet['main_style_right']))
        reportlab_table_data.append(new_row)

    toc_table = Table(reportlab_table_data,
                      colWidths=[1.3 * cm, title_col_width * cm, date_col_width * cm, page_col_width * cm],
                      repeatRows=1, cornerRadii=(5, 5, 0, 0))
    style = TableStyle([
        # Style for header row:
        ('BACKGROUND', (0, 0), (-1, 0), colors.darkgray),
        ('BOTTOMPADDING', (0, 0), (-1, 0), 8),
        ('LINEBELOW', (0, 0), (-1, 0), 1, colors.black),
        ('ALIGNMENT', (0, 0), (-1, 0), 'CENTRE'),
        ('FONTSIZE', (0, 0), (-1, 0), 12),
        # rest of table:
        ('VALIGN', (0, 0), (-1, -1), 'TOP'),
        ('BACKGROUND', (0, 1), (-1, -1), colors.white),
        ('LINEBELOW', (0, 1), (-1, -1), 0.3, colors.black),
    ])
    # paint section breaks with grey background:
    for section_break_row in list_of_section_breaks:
        style.add('BACKGROUND', (0, int(section_break_row)), (-1, int(section_break_row)), colors.lightgrey)
    toc_table.setStyle(style)

    # Now, build the pdf. Footer page numbers come from the page callback.
    elements = [claimno_table, header_table, Spacer(1, 1 * cm), toc_table]
    if not bundle_config.roman_for_preface:
        reportlab_pdf.build(elements, onFirstPage=reportlab_footer_config, onLaterPages=reportlab_footer_config)
    else:
        reportlab_pdf.build(elements)
def generate_footer_pages_reportlab(filename, num_pages):
    """
    Generate an A4 PDF of otherwise blank pages, each drawn through the
    reportlab_footer_config page callback.

    Args:
        filename (str): The name of the output PDF file.
        num_pages (int): Number of blank pages to create.
    """
    bundle_logger.debug(f"[GFP]Generating {num_pages} blank pages in {filename}")

    footer_doc = SimpleDocTemplate(filename, pagesize=A4)

    # ReportLab protects against infinite loops by checking whether or not a
    # page has content at build time, and terminates after 10 pages without
    # content. It doesn't count footer content as content, so it breaks when
    # generating footer-only pages.
    # Workaround: since reportlab defines 'content' in this sense as anything
    # which is a flowable, put an invisible flowable on each page.
    placeholder = Paragraph("")

    # One placeholder followed by one page break per requested page.
    story = []
    for _ in range(num_pages):
        story.extend((placeholder, PageBreak()))

    # Build the document with the footer config on every page:
    footer_doc.build(story, onFirstPage=reportlab_footer_config, onLaterPages=reportlab_footer_config)
def reportlab_footer_config(canvas, doc):
    """
    Page-decoration callback: draw the page-number footer and, when enabled,
    the filename header onto the current page.

    This is passed to ReportLab as `onFirstPage` / `onLaterPages` and is
    called by ReportLab during its build process; it is not called directly.
    Because ReportLab controls the arguments, every setting is read from the
    module-level `bundle_config` object rather than from parameters.

    Args:
        canvas: The ReportLab canvas for the page being drawn.
        doc: The document being built (supplied by ReportLab, unused here).
    """
    # The same callback produces footers for the TOC and for the main bundle.
    # The TOC needs no page offset but the main bundle does. Since the call
    # signature cannot be extended, the offset lives in bundle_config: it is
    # falsy (treated as 0) while the TOC is generated and is later set to the
    # frontmatter length.
    length_of_frontmatter_offset = bundle_config.expected_length_of_frontmatter or 0
    total_number_of_pages = bundle_config.total_number_of_pages or 0
    page_num_alignment = bundle_config.page_num_align or None
    page_num_font = bundle_config.footer_font or None
    page_numbering_style = bundle_config.page_num_style or None
    footer_prefix = bundle_config.footer_prefix or ""

    # Font setting -> (ReportLab font name, base font size).
    # 'sans' and 'Helvetica' are both accepted for the sans-serif face; the
    # original branch had the setting and the font name swapped, which
    # produced the non-existent font name 'sans'.
    font_choices = {
        'serif': ('Times-Roman', 15),
        'sans': ('Helvetica', 14),
        'Helvetica': ('Helvetica', 14),
        'mono': ('Courier', 14),
        'traditional': ('Charter_regular', 15),
    }
    # Anything unrecognised (including None) defaults to Helvetica.
    footer_font, footer_base_font_size = font_choices.get(page_num_font, ('Helvetica', 14))

    # Alignment setting -> paragraph alignment; unrecognised values go right.
    alignment_choices = {
        "left": TA_LEFT,
        "right": TA_RIGHT,
        "centre": TA_CENTER,
    }
    footer_style = ParagraphStyle(
        'BodyText',
        fontSize=footer_base_font_size,
        fontName=footer_font,
        alignment=alignment_choices.get(page_num_alignment, TA_RIGHT),
    )

    # Numbering style -> label template. Every style applies the frontmatter
    # offset to the page number (previously "x_of_y" alone skipped it).
    label_templates = {
        "x": "{page}",
        "x_of_y": "{page} of {total}",
        "page_x": "Page {page}",
        "page_x_of_y": "Page {page} of {total}",
        "x_slash_y": "{page} / {total}",
    }
    # Unrecognised styles default to "Page x".
    label_template = label_templates.get(page_numbering_style, "Page {page}")

    # Optional prefix, then the page label appended to it.
    footer_data = footer_prefix.strip() + " " if footer_prefix else ""
    footer_data += label_template.format(
        page=canvas.getPageNumber() + length_of_frontmatter_offset,
        total=total_number_of_pages,
    )

    canvas.saveState()
    try:
        canvas.setFont('Times-Bold', 16)

        footer_frame = Frame(
            0, 0 * cm,  # x, y lower left
            PAGE_WIDTH, 1.5 * cm,  # box width and height
            leftPadding=50,
            bottomPadding=0,
            rightPadding=50,
            topPadding=0,
            id="footerframe",
            # NOTE(review): this draws a visible box around the footer while
            # the header frame below uses showBoundary=0 - confirm intended.
            showBoundary=1
        )
        footer_frame.hAlign = "RIGHT"
        footer_frame.add(Paragraph(footer_data, footer_style), canvas)

        # Draw filename header if enabled
        if bundle_config.header_filename and bundle_config.page_to_filename:
            current_page = canvas.getPageNumber()  # 1-based within the overlay PDF
            header_text = bundle_config.page_to_filename.get(current_page, "")
            if header_text:
                header_style = ParagraphStyle(
                    'HeaderText',
                    fontSize=8,
                    fontName=footer_font,  # match the footer font choice
                    textColor=colors.Color(0.4, 0.4, 0.4),
                    alignment=TA_LEFT
                )
                header_frame = Frame(
                    0, PAGE_HEIGHT - 1.2 * cm,
                    PAGE_WIDTH, 1.0 * cm,
                    leftPadding=50,
                    bottomPadding=0,
                    rightPadding=50,
                    topPadding=0,
                    id="headerframe",
                    showBoundary=0
                )
                header_frame.add(Paragraph(header_text, header_style), canvas)
    finally:
        # Balance the saveState() above so no graphics state leaks out of
        # this callback, even if drawing raises.
        canvas.restoreState()
def create_toc_pdf_tex(toc_entries, casedetails, output_file, confidential=False, date_setting=True,
index_font_setting=None, dummy=False, frontmatter_offset=0, length_of_coversheet=0,
page_num_alignment=None, page_num_font=None, page_numbering_style=None, footer_prefix=None,
main_page_count=0, roman_numbering=False):
'''
First version of toc generator. Generates table of contents pages
as well as its own footer.
'''
bundle_name = sanitise_latex(casedetails[0])
bundle_logger.debug(f"[CTP]Creating TOC PDF. Parsing settings:")
if dummy:
page_offset = 0
bundle_logger.debug("[CTP]..This is the first pass dummy TOC")
else:
page_offset = frontmatter_offset + 1
bundle_logger.debug(f"[CTP]..Creating the final TOC. The frontmatter offset is {frontmatter_offset}")
if date_setting == "hide_date": # if date disabled: keep the column, just make it small and blank out the header
bundle_logger.debug("[CTP]..Date column disabled")
date_col_hdr = ""
date_col_width = "0.3cm"
elif date_setting == "show_date":
bundle_logger.debug("[CTP]..Date column enabled")
date_col_hdr = "Date"
date_col_width = "3.5cm"
else:
bundle_logger.debug("[CTP]..Date setting blank, enabling date column by default.")
date_col_hdr = "Date"
date_col_width = "3.5cm"
if casedetails[1]:
claimno_sanitised = sanitise_latex(casedetails[1])
# claimno_hdr = f"Claim No. {claimno_sanitised}"
claimno_hdr = claimno_sanitised
bundle_logger.debug(f"[CTP]..Claim number: {claimno_sanitised}")
else:
claimno_hdr = ""
bundle_logger.debug("[CTP]..No claim number provided")
if casedetails[2]:
casename = sanitise_latex(casedetails[2])
bundle_logger.debug(f"[CTP]..Case name: {casename}")
else:
casename = ""
bundle_logger.debug("[CTP]..No case name provided")
index_font_family = None
if not roman_numbering:
# parse index font setting
# set starting page to be one more than the length_of_coversheet
starting_page = length_of_coversheet + 1
if index_font_setting == "sans":
index_font_family = "phv" # LaTeX font family for Helvetica, see https://www.overleaf.com/learn/latex/Font_typefaces#Reference_guide
bundle_logger.debug("[CTP]..Sans-serif font selected for TOC")
elif index_font_setting == "serif":
index_font_family = "ppl" # LaTeX font family for Palatino
bundle_logger.debug("[CTP]..Serif font selected for TOC")
elif index_font_setting == "mono":
index_font_family = "pcr" # LaTeX font family for Courier
bundle_logger.debug("[CTP]..Monospace font selected for TOC")
else:
index_font_family = "" # Default to Computer modern
bundle_logger.debug("[CTP]..No font setting provided, using default font for TOC")
# parse alignment setting
if page_num_alignment == "left":
footer_alignment_setting = r"LO LE"
bundle_logger.debug("[MPNP]..Left alignment selected for page numbers")
elif page_num_alignment == "right":
footer_alignment_setting = r"RO RE"
bundle_logger.debug("[MPNP]..Right alignment selected for page numbers")
elif page_num_alignment == "centre":
footer_alignment_setting = r"CO CE"
bundle_logger.debug("[MPNP]..Centre alignment selected for page numbers")
else:
footer_alignment_setting = r"CO CE"
bundle_logger.debug("[MPNP]..Defaulting to centre alignment for page numbers")
# parse font setting