Module Pdftext


module Pdftext: sig .. end
Parsing fonts and extracting text. Note: text extraction is incomplete.


Data Types



type type3_glpyhs = {
   fontbbox : float * float * float * float;
   fontmatrix : Transform.transform_matrix;
   charprocs : (string * Pdf.pdfobject) list;
   type3_resources : Pdf.pdfobject;
}
type simple_fonttype =
| Type1
| MMType1
| Type3 of type3_glpyhs
| Truetype
type fontmetrics = float array 

type fontfile =
| FontFile of int
| FontFile2 of int
| FontFile3 of int

type fontdescriptor = {
   ascent : float;
   descent : float;
   leading : float;
   avgwidth : float;
   maxwidth : float;
   fontfile : fontfile option;
}
type differences = (string * int) list 

type encoding =
| ImplicitInFontFile
| StandardEncoding
| MacRomanEncoding
| WinAnsiEncoding
| MacExpertEncoding
| CustomEncoding of encoding * differences
| FillUndefinedWithStandard of encoding

type simple_font = {
   fonttype : simple_fonttype;
   basefont : string;
   fontmetrics : fontmetrics option;
   fontdescriptor : fontdescriptor option;
   encoding : encoding;
}
type standard_font =
| TimesRoman
| TimesBold
| TimesItalic
| TimesBoldItalic
| Helvetica
| HelveticaBold
| HelveticaOblique
| HelveticaBoldOblique
| Courier
| CourierBold
| CourierOblique
| CourierBoldOblique
| Symbol
| ZapfDingbats
val string_of_standard_font : standard_font -> string
val standard_font_of_name : string -> standard_font option

type cid_system_info = {
   registry : string;
   ordering : string;
   supplement : int;
}
type composite_CIDfont = {
   cid_system_info : cid_system_info;
   cid_basefont : string;
   cid_fontdescriptor : fontdescriptor;
   cid_widths : (int * float) list;
   cid_default_width : int;
}
type cmap_encoding =
| Predefined of string
| CMap of int

type font =
| StandardFont of standard_font * encoding
| SimpleFont of simple_font
| CIDKeyedFont of string * composite_CIDfont * cmap_encoding

Reading a Font


val read_font : Pdf.pdfdoc -> Pdf.pdfobject -> font
Read a font from a given document and object

Text Extraction


type text_extractor 
The type of text extractors.
val text_extractor_of_font : Pdf.pdfdoc -> Pdf.pdfobject -> text_extractor
Build a text extractor from a document and font object
val codepoints_of_text : text_extractor -> string -> int list
Return the list of Unicode code points for a given extractor and string (for example, a string taken from a Pdfpages.Op_Tj or Op_TJ operator).
val utf16be_of_text : text_extractor -> string -> string
Same as codepoints_of_text, but return the result as a UTF-16BE string.
val utf16be_of_codepoints : int list -> string
Return UTF16BE from a list of codepoints.
val latin1_string_of_text : text_extractor -> string -> string
Same as codepoints_of_text, but return the result as a Latin-1 string (lossy).
val decode_char : encoding -> char -> char
Decode a single character code in a standard font
val decode_type3_char : encoding -> char -> string
Decode a single character code in a Type 3 font to a glyph name.
val codepoints_of_textstring : string -> int list
Read the code points in a PDF text string, which may be encoded as either Unicode or PDFDocEncoding.
val string_of_font : font -> string
Return a string describing a font, for debugging purposes.