Contents

PDF相关

前端预览

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
<div id="pdf-content"></div>

<script src="/path/pdf.min.js"></script>
<script src="/path/pdf.worker.min.js"></script>
<script type="module">
/**
 * Render every page of a PDF into a container element using pdf.js.
 *
 * Each page is drawn on its own <canvas>, wrapped in a <div class="pdfPage">.
 * The wrappers are inserted in page order before any page has loaded, so the
 * DOM order always matches the document order.
 *
 * @param {string} url        Location of the PDF passed to pdf.js getDocument.
 * @param {string} workerSrc  URL of the pdf.js worker script.
 * @param {string} cid        id of the DOM element that receives the pages.
 * @param {number} [scale=0]  Render scale. 0 means "fit each page to the
 *                            container's width".
 */
function loadPdf(url, workerSrc, cid, scale = 0) {
    var pdfJsLib = window['pdfjs-dist/build/pdf'];

    pdfJsLib.GlobalWorkerOptions.workerSrc = workerSrc;

    // Asynchronous download of PDF
    var loadingTask = pdfJsLib.getDocument(url);
    loadingTask.promise.then(function (pdf) {
        var container = document.getElementById(cid);
        if (!container) {
            console.error('loadPdf: no element with id "' + cid + '"');
            return;
        }
        container.innerHTML = '';

        // Measure once, before any canvas is inserted, so that layout changes
        // during rendering (e.g. a scrollbar appearing) cannot give later
        // pages a different target width.
        var desiredWidth = container.offsetWidth;

        for (var pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
            // getPage() promises may settle in any order; reserving the
            // wrapper synchronously keeps the pages in document order.
            var pageContainer = document.createElement('div');
            pageContainer.className = 'pdfPage';
            container.appendChild(pageContainer);
            renderPage(pdf, pageNumber, pageContainer, desiredWidth);
        }
    }, function (reason) {
        console.error(reason);
    });

    // Load one page and draw it on a fresh canvas inside its reserved wrapper.
    function renderPage(pdf, pageNumber, pageContainer, desiredWidth) {
        pdf.getPage(pageNumber).then(function (page) {
            // Keep the computed scale local: writing it back to the shared
            // `scale` parameter would make every page depend on whichever
            // page happened to load first.
            var pageScale = scale;
            if (pageScale === 0) {
                var naturalWidth = page.getViewport({scale: 1}).width;
                // A hidden container reports width 0; fall back to 1:1.
                pageScale = desiredWidth > 0 ? desiredWidth / naturalWidth : 1;
            }
            var viewport = page.getViewport({scale: pageScale});

            var canvas = document.createElement('canvas');
            var context = canvas.getContext('2d');
            canvas.height = viewport.height;
            canvas.width = viewport.width;
            pageContainer.appendChild(canvas);

            // Return the render promise so failures reach the catch below.
            return page.render({
                canvasContext: context,
                viewport: viewport
            }).promise;
        }).catch(function (reason) {
            console.error(reason);
        });
    }
}
// Render the PDF into the #pdf-content element; scale is omitted, so loadPdf's default of 0 (derive the scale from the container width) applies.
loadPdf("/pdf/download/url", "/path/pdf.worker.min.js", "pdf-content");
</script>

合并

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
from PyPDF2 import PdfMerger
import sys

def merge_pdfs(pdf_files, out_path):
    """Concatenate several PDF files into one output file.

    Args:
        pdf_files: Iterable of inputs; each item is passed to
            ``PdfMerger.append`` and the output keeps this order.
        out_path: Destination passed to ``PdfMerger.write``.

    Raises:
        Whatever ``PdfMerger.append`` or ``PdfMerger.write`` raises for an
        unreadable input or an unwritable destination; the merger is closed
        before the exception propagates.
    """
    merger = PdfMerger()
    try:
        for file in pdf_files:
            merger.append(file)
        merger.write(out_path)
    finally:
        # Always close, even when an input cannot be read or the write fails,
        # so the merger does not keep its resources open.
        merger.close()


# Usage: python3 main.py test.pdf,test1.pdf output.pdf
if __name__ == "__main__":
    pdf_files = sys.argv[1] if len(sys.argv) > 1 else ''
    out_path = sys.argv[2] if len(sys.argv) > 2 else ''
    # ''.split(",") yields [''], so drop blank entries instead of asking the
    # merger to open a file with an empty name (this also tolerates a
    # trailing comma in the list).
    input_paths = [name for name in pdf_files.split(",") if name]
    if not input_paths or not out_path:
        sys.exit("usage: python3 main.py test.pdf,test1.pdf output.pdf")
    merge_pdfs(input_paths, out_path)

高亮显示

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import fitz
import sys

def read_clip(page, x0, y0, x1, y1):
    """Return what ``page.get_text`` extracts inside the rectangle (x0, y0)-(x1, y1)."""
    clip_area = fitz.Rect(x0, y0, x1, y1)
    return page.get_text(clip=clip_area)

def highlight(pdf_path, output_path, texts, line_height=15):
    """Highlight a full-width band at every occurrence of the given texts.

    For each match, the band starts 2 units above the match and spans the
    whole page width (plus a 5-unit overhang on both sides). The band is then
    extended downward in ``line_height`` steps for as long as the area
    directly below the match is empty while that row still contains text
    somewhere else on the page. NOTE(review): this presumably covers wrapped
    continuation lines belonging to the same entry -- confirm against the
    documents this is used on.

    Args:
        pdf_path: PDF to open with ``fitz.open``.
        output_path: Where the annotated copy is saved.
        texts: Iterable of strings to search for; empty strings are ignored.
        line_height: Vertical step, in page coordinate units, used when
            probing the rows below a match. Defaults to 15, the value that
            was previously hard-coded.
    """
    # An empty needle (e.g. from "".split(",")) cannot match anything useful.
    needles = [text for text in texts if text]

    # The context manager closes the document even if searching or saving fails.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # Use each page's own width; pages in one document may differ.
            page_width = page.rect.width
            sites = []
            for text in needles:
                sites.extend(page.search_for(text))
            for inst in sites:
                x0, y0, x1, y1 = inst
                band_top = y0
                next_text = str(read_clip(page, x0, y0 + line_height, x1, y1 + line_height)).strip()
                while next_text == "":
                    next_row = str(read_clip(page, 0, y0 + line_height, page_width, y1 + line_height)).strip()
                    if next_row == "":
                        # The whole row is blank: stop extending the band.
                        break
                    y0 = y0 + line_height
                    y1 = y1 + line_height
                    next_text = str(read_clip(page, x0, y0 + line_height, x1, y1 + line_height)).strip()
                annot = page.add_highlight_annot((-5, band_top - 2, page_width + 5, y1))
                annot.set_colors(stroke=[1, 0.94, 0.4])
                annot.update()

        # OUTPUT
        doc.save(output_path, garbage=4, deflate=True, clean=True)

# Usage: python3 main.py in.pdf output.pdf needle1,needle2
if __name__ == "__main__":
    in_pdf = sys.argv[1] if len(sys.argv) > 1 else ''
    out_path = sys.argv[2] if len(sys.argv) > 2 else ''
    needles_text = sys.argv[3] if len(sys.argv) > 3 else ''
    # ''.split(",") yields [''], so drop blank entries before searching.
    needles = [needle for needle in needles_text.split(",") if needle]
    if not in_pdf or not out_path or not needles:
        sys.exit("usage: python3 main.py in.pdf output.pdf needles_text")
    highlight(in_pdf, out_path, needles)
coffee