A simple Go library which enables reading PDF files; forked from https://github.com/ledongthuc/pdf, which was forked from https://github.com/rsc/pdf.
Thuc Le's fork has the most (and slightly more recent) updates compared to all other forks of RSC's pdf repo.
Features
- Get plain text content (without formatting)
- Get Content (including all font and formatting information)
go get -u github.com/zacharysyoung/rsc-thuc-pdf
package main
import (
"bytes"
"fmt"
pdf "github.com/zacharysyoung/rsc-thuc-pdf"
)
// main is the example entry point: it extracts the plain text of "test.pdf"
// via readPdf and prints it to standard output, panicking if that fails.
func main() {
	// Switch on the pdf package's DebugOn flag before any file is opened.
	pdf.DebugOn = true

	text, err := readPdf("test.pdf")
	if err != nil {
		panic(err)
	}

	fmt.Println(text)
}
// readPdf opens the PDF at path and returns its plain-text content.
//
// It returns a non-nil error if the file cannot be opened, if plain-text
// extraction fails, or if the extracted text cannot be read into memory.
// The returned string is empty whenever the error is non-nil.
func readPdf(path string) (string, error) {
	f, r, err := pdf.Open(path)
	if err != nil {
		// NOTE(review): Open may hand back a file handle together with an
		// error — confirm against the pdf package. Closing it here avoids
		// leaking the descriptor in that case.
		if f != nil {
			_ = f.Close()
		}
		return "", fmt.Errorf("opening %q: %w", path, err)
	}
	// Keep the file open until extraction has finished: the reader presumably
	// pulls its data from f on demand, so closing it up front would make the
	// reads below fail. The Close error is ignored because f is only read.
	defer func() { _ = f.Close() }()

	b, err := r.GetPlainText()
	if err != nil {
		return "", fmt.Errorf("extracting plain text from %q: %w", path, err)
	}

	var buf bytes.Buffer
	if _, err := buf.ReadFrom(b); err != nil {
		return "", fmt.Errorf("reading plain text of %q: %w", path, err)
	}
	return buf.String(), nil
}
Thuc Le did not implement isSameSentence, and it is not yet clear what it should do.
// readStyledText walks every page of the PDF at path and prints, for each run
// of text fragments that isSameSentence groups together, the font, font size,
// position and concatenated content of that run.
//
// The string result is always empty; output goes to standard output. A
// non-nil error is returned only when the file cannot be opened.
//
// isSameSentence is still a placeholder that panics, so any page holding more
// than one text fragment will panic until a real implementation is supplied.
func readStyledText(path string) (string, error) {
	f, r, err := pdf.Open(path)
	if err != nil {
		// NOTE(review): Open may hand back a file handle together with an
		// error — confirm against the pdf package. Closing it here avoids
		// leaking the descriptor in that case.
		if f != nil {
			_ = f.Close()
		}
		return "", fmt.Errorf("opening %q: %w", path, err)
	}
	// The Close error is ignored because f is only read.
	defer func() { _ = f.Close() }()

	// need real implementation
	isSameSentence := func(thisTxt, lastTxtStyle pdf.Text) bool { panic("not implemented") }

	// printRun reports one accumulated run of text together with its style.
	printRun := func(t pdf.Text) {
		fmt.Printf("Font: %s, Font-size: %f, x: %f, y: %f, content: %s \n", t.Font, t.FontSize, t.X, t.Y, t.S)
	}

	totalPage := r.NumPage()
	for pageIndex := 1; pageIndex <= totalPage; pageIndex++ {
		p := r.Page(pageIndex)
		if p.V.IsNull() {
			continue
		}

		var lastTextStyle pdf.Text
		started := false // true once lastTextStyle holds a real fragment
		for _, text := range p.Content().Text {
			switch {
			case !started:
				// The first fragment opens a run; there is nothing to
				// compare it with and nothing to print yet.
				lastTextStyle, started = text, true
			case isSameSentence(text, lastTextStyle):
				lastTextStyle.S += text.S
			default:
				printRun(lastTextStyle)
				lastTextStyle = text
			}
		}
		// Flush the final run of the page, which would otherwise be dropped.
		if started {
			printRun(lastTextStyle)
		}
	}
	return "", nil
}
// readByRow prints the text of the PDF at path to standard output row by row:
// for every row it prints a ">>>> row: " header carrying the row's Position,
// followed by each word of the row on its own line.
//
// The function has no error result, so a failure to open the file panics with
// a descriptive error. A page whose rows cannot be read is reported on
// standard output and skipped.
func readByRow(path string) {
	f, r, err := pdf.Open(path)
	if err != nil {
		// NOTE(review): Open may hand back a file handle together with an
		// error — confirm against the pdf package. Closing it here avoids
		// leaking the descriptor in that case.
		if f != nil {
			_ = f.Close()
		}
		panic(fmt.Errorf("opening %q: %w", path, err))
	}
	// The Close error is ignored because f is only read.
	defer func() { _ = f.Close() }()

	totalPage := r.NumPage()
	for pageIndex := 1; pageIndex <= totalPage; pageIndex++ {
		p := r.Page(pageIndex)
		if p.V.IsNull() {
			continue
		}

		rows, err := p.GetTextByRow()
		if err != nil {
			fmt.Printf("page %d: reading rows: %v\n", pageIndex, err)
			continue
		}
		for _, row := range rows {
			// fmt.Println instead of the builtin println keeps the header on
			// the same stream, and in the same order, as the words below.
			fmt.Println(">>>> row: ", row.Position)
			for _, word := range row.Content {
				fmt.Println(word.S)
			}
		}
	}
}