1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
| import fitz
import sys
def read_clip(page, x0, y0, x1, y1):
    """Return the text of ``page`` found inside a rectangular region.

    The coordinates are handed to ``fitz.Rect`` in the order
    ``x0, y0, x1, y1`` and the resulting rectangle is used as the
    ``clip`` argument of ``page.get_text``.
    """
    clip_area = fitz.Rect(x0, y0, x1, y1)
    return page.get_text(clip=clip_area)
def highlight(pdf_path, output_path, texts, *, row_step=15):
    """Highlight the full-width row of every occurrence of ``texts`` in a PDF.

    For each page, every string in ``texts`` is located with
    ``page.search_for``.  Starting from each hit, the area directly below
    it (same horizontal extent, shifted down by ``row_step``) is inspected:
    as long as that area holds no text while the full-width band at the
    same height does, the bottom edge of the highlight is pushed down by
    ``row_step``.  A highlight annotation spanning the whole page width is
    then added from just above the hit down to that bottom edge, and the
    document is written to ``output_path``.

    Args:
        pdf_path: Path of the PDF to read.
        output_path: Path the annotated PDF is saved to.
        texts: Iterable of strings to search for.  Empty strings are
            skipped because they cannot identify a location.
        row_step: Vertical distance used for each downward probe.
            Defaults to 15, the value that used to be hard-coded.
    """
    # The context manager closes the document even if annotating or
    # saving raises, so the file handle is not leaked.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # Measure each page individually: pages in one document may
            # differ in size, so the width of page 0 is not reusable.
            page_width = page.rect.width

            hits = []
            for text in texts:
                if text:
                    hits.extend(page.search_for(text))

            for hit in hits:
                x0, y0, x1, y1 = hit
                top = y0  # remember where the hit starts before y0 is advanced

                # Grow downward while nothing sits directly below the hit
                # but the row at that height is not empty either.
                while not str(
                    read_clip(page, x0, y0 + row_step, x1, y1 + row_step)
                ).strip():
                    row_text = str(
                        read_clip(page, 0, y0 + row_step, page_width, y1 + row_step)
                    ).strip()
                    if not row_text:
                        # The whole band is blank: stop extending.
                        break
                    y0 += row_step
                    y1 += row_step

                # Named ``annot`` so the local does not shadow this function.
                # The rectangle deliberately overshoots the page by 5 units
                # on both sides and starts 2 units above the hit.
                annot = page.add_highlight_annot(
                    (-5, top - 2, page_width + 5, y1)
                )
                annot.set_colors(stroke=[1, 0.94, 0.4])
                annot.update()

        # OUTPUT -- must happen while the document is still open.
        doc.save(output_path, garbage=4, deflate=True, clean=True)
# "python3 main.py in.pdf output.pdf needles_text"
# Take up to three positional arguments; any that are missing fall back to
# the empty string.  needles_text is a comma-separated list of search terms.
in_pdf, out_path, needles_text = (sys.argv[1:4] + ["", "", ""])[:3]
highlight(in_pdf, out_path, needles_text.split(","))
|