招标
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 

799 lines
27 KiB

<?php
/**
* @file
* This file is part of the PdfParser library.
*
* @author Sébastien MALOT <sebastien@malot.fr>
* @date 2017-01-03
*
* @license LGPLv3
* @url <https://github.com/smalot/pdfparser>
*
* PdfParser is a pdf library written in PHP, extraction oriented.
* Copyright (C) 2017 - Sébastien MALOT <sebastien@malot.fr>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program.
* If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
*/
namespace Smalot\PdfParser;
use Smalot\PdfParser\Element\ElementArray;
use Smalot\PdfParser\Element\ElementMissing;
use Smalot\PdfParser\Element\ElementNull;
use Smalot\PdfParser\Element\ElementXRef;
class Page extends PDFObject
{
/**
* @var Font[]
*/
protected $fonts = null;
/**
* @var PDFObject[]
*/
protected $xobjects = null;
/**
* @var array
*/
protected $dataTm = null;
/**
* @return Font[]
*/
public function getFonts()
{
if (null !== $this->fonts) {
return $this->fonts;
}
$resources = $this->get('Resources');
if (method_exists($resources, 'has') && $resources->has('Font')) {
if ($resources->get('Font') instanceof ElementMissing) {
return [];
}
if ($resources->get('Font') instanceof Header) {
$fonts = $resources->get('Font')->getElements();
} else {
$fonts = $resources->get('Font')->getHeader()->getElements();
}
$table = [];
foreach ($fonts as $id => $font) {
if ($font instanceof Font) {
$table[$id] = $font;
// Store too on cleaned id value (only numeric)
$id = preg_replace('/[^0-9\.\-_]/', '', $id);
if ('' != $id) {
$table[$id] = $font;
}
}
}
return $this->fonts = $table;
}
return [];
}
/**
* @param string $id
*
* @return Font|null
*/
public function getFont($id)
{
$fonts = $this->getFonts();
if (isset($fonts[$id])) {
return $fonts[$id];
}
// According to the PDF specs (https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf, page 238)
// "The font resource name presented to the Tf operator is arbitrary, as are the names for all kinds of resources"
// Instead, we search for the unfiltered name first and then do this cleaning as a fallback, so all tests still pass.
if (isset($fonts[$id])) {
return $fonts[$id];
} else {
$id = preg_replace('/[^0-9\.\-_]/', '', $id);
if (isset($fonts[$id])) {
return $fonts[$id];
}
}
return null;
}
/**
* Support for XObject
*
* @return PDFObject[]
*/
public function getXObjects()
{
if (null !== $this->xobjects) {
return $this->xobjects;
}
$resources = $this->get('Resources');
if (method_exists($resources, 'has') && $resources->has('XObject')) {
if ($resources->get('XObject') instanceof Header) {
$xobjects = $resources->get('XObject')->getElements();
} else {
$xobjects = $resources->get('XObject')->getHeader()->getElements();
}
$table = [];
foreach ($xobjects as $id => $xobject) {
$table[$id] = $xobject;
// Store too on cleaned id value (only numeric)
$id = preg_replace('/[^0-9\.\-_]/', '', $id);
if ('' != $id) {
$table[$id] = $xobject;
}
}
return $this->xobjects = $table;
}
return [];
}
/**
* @param string $id
*
* @return PDFObject|null
*/
public function getXObject($id)
{
$xobjects = $this->getXObjects();
if (isset($xobjects[$id])) {
return $xobjects[$id];
}
return null;
/*$id = preg_replace('/[^0-9\.\-_]/', '', $id);
if (isset($xobjects[$id])) {
return $xobjects[$id];
} else {
return null;
}*/
}
/**
* @param Page $page
*
* @return string
*/
public function getText(self $page = null)
{
if ($contents = $this->get('Contents')) {
if ($contents instanceof ElementMissing) {
return '';
} elseif ($contents instanceof ElementNull) {
return '';
} elseif ($contents instanceof PDFObject) {
$elements = $contents->getHeader()->getElements();
if (is_numeric(key($elements))) {
$new_content = '';
foreach ($elements as $element) {
if ($element instanceof ElementXRef) {
$new_content .= $element->getObject()->getContent();
} else {
$new_content .= $element->getContent();
}
}
$header = new Header([], $this->document);
$contents = new PDFObject($this->document, $header, $new_content);
}
} elseif ($contents instanceof ElementArray) {
// Create a virtual global content.
$new_content = '';
foreach ($contents->getContent() as $content) {
$new_content .= $content->getContent()."\n";
}
$header = new Header([], $this->document);
$contents = new PDFObject($this->document, $header, $new_content);
}
return $contents->getText($this);
}
return '';
}
/**
* @param Page $page
*
* @return array
*/
public function getTextArray(self $page = null)
{
if ($contents = $this->get('Contents')) {
if ($contents instanceof ElementMissing) {
return [];
} elseif ($contents instanceof ElementNull) {
return [];
} elseif ($contents instanceof PDFObject) {
$elements = $contents->getHeader()->getElements();
if (is_numeric(key($elements))) {
$new_content = '';
/** @var PDFObject $element */
foreach ($elements as $element) {
if ($element instanceof ElementXRef) {
$new_content .= $element->getObject()->getContent();
} else {
$new_content .= $element->getContent();
}
}
$header = new Header([], $this->document);
$contents = new PDFObject($this->document, $header, $new_content);
}
} elseif ($contents instanceof ElementArray) {
// Create a virtual global content.
$new_content = '';
/** @var PDFObject $content */
foreach ($contents->getContent() as $content) {
$new_content .= $content->getContent()."\n";
}
$header = new Header([], $this->document);
$contents = new PDFObject($this->document, $header, $new_content);
}
return $contents->getTextArray($this);
}
return [];
}
/**
* Gets all the text data with its internal representation of the page.
*
* @return array An array with the data and the internal representation
*/
public function extractRawData()
{
/*
* Now you can get the complete content of the object with the text on it
*/
$extractedData = [];
$content = $this->get('Contents');
$values = $content->getContent();
if (isset($values) && \is_array($values)) {
$text = '';
foreach ($values as $section) {
$text .= $section->getContent();
}
$sectionsText = $this->getSectionsText($text);
foreach ($sectionsText as $sectionText) {
$commandsText = $this->getCommandsText($sectionText);
foreach ($commandsText as $command) {
$extractedData[] = $command;
}
}
} else {
$sectionsText = $content->getSectionsText($content->getContent());
foreach ($sectionsText as $sectionText) {
$extractedData[] = ['t' => '', 'o' => 'BT', 'c' => ''];
$commandsText = $content->getCommandsText($sectionText);
foreach ($commandsText as $command) {
$extractedData[] = $command;
}
}
}
return $extractedData;
}
/**
* Gets all the decoded text data with it internal representation from a page.
*
* @param array $extractedRawData the extracted data return by extractRawData or
* null if extractRawData should be called
*
* @return array An array with the data and the internal representation
*/
public function extractDecodedRawData($extractedRawData = null)
{
if (!isset($extractedRawData) || !$extractedRawData) {
$extractedRawData = $this->extractRawData();
}
$currentFont = null;
foreach ($extractedRawData as &$command) {
if ('Tj' == $command['o'] || 'TJ' == $command['o']) {
$data = $command['c'];
if (!\is_array($data)) {
$tmpText = '';
if (isset($currentFont)) {
$tmpText = $currentFont->decodeOctal($data);
//$tmpText = $currentFont->decodeHexadecimal($tmpText, false);
}
$tmpText = str_replace(
['\\\\', '\(', '\)', '\n', '\r', '\t', '\ '],
['\\', '(', ')', "\n", "\r", "\t", ' '],
$tmpText
);
$tmpText = utf8_encode($tmpText);
if (isset($currentFont)) {
$tmpText = $currentFont->decodeContent($tmpText);
}
$command['c'] = $tmpText;
continue;
}
$numText = \count($data);
for ($i = 0; $i < $numText; ++$i) {
if (0 != ($i % 2)) {
continue;
}
$tmpText = $data[$i]['c'];
$decodedText = '';
if (isset($currentFont)) {
$decodedText = $currentFont->decodeOctal($tmpText);
//$tmpText = $currentFont->decodeHexadecimal($tmpText, false);
}
$decodedText = str_replace(
['\\\\', '\(', '\)', '\n', '\r', '\t', '\ '],
['\\', '(', ')', "\n", "\r", "\t", ' '],
$decodedText
);
$decodedText = utf8_encode($decodedText);
if (isset($currentFont)) {
$decodedText = $currentFont->decodeContent($decodedText);
}
$command['c'][$i]['c'] = $decodedText;
continue;
}
} elseif ('Tf' == $command['o'] || 'TF' == $command['o']) {
$fontId = explode(' ', $command['c'])[0];
$currentFont = $this->getFont($fontId);
continue;
}
}
return $extractedRawData;
}
/**
* Gets just the Text commands that are involved in text positions and
* Text Matrix (Tm)
*
* It extract just the PDF commands that are involved with text positions, and
* the Text Matrix (Tm). These are: BT, ET, TL, Td, TD, Tm, T*, Tj, ', ", and TJ
*
* @param array $extractedDecodedRawData The data extracted by extractDecodeRawData.
* If it is null, the method extractDecodeRawData is called.
*
* @return array An array with the text command of the page
*/
public function getDataCommands($extractedDecodedRawData = null)
{
if (!isset($extractedDecodedRawData) || !$extractedDecodedRawData) {
$extractedDecodedRawData = $this->extractDecodedRawData();
}
$extractedData = [];
foreach ($extractedDecodedRawData as $command) {
switch ($command['o']) {
/*
* BT
* Begin a text object, inicializind the Tm and Tlm to identity matrix
*/
case 'BT':
$extractedData[] = $command;
break;
/*
* ET
* End a text object, discarding the text matrix
*/
case 'ET':
$extractedData[] = $command;
break;
/*
* leading TL
* Set the text leading, Tl, to leading. Tl is used by the T*, ' and " operators.
* Initial value: 0
*/
case 'TL':
$extractedData[] = $command;
break;
/*
* tx ty Td
* Move to the start of the next line, offset form the start of the
* current line by tx, ty.
*/
case 'Td':
$extractedData[] = $command;
break;
/*
* tx ty TD
* Move to the start of the next line, offset form the start of the
* current line by tx, ty. As a side effect, this operator set the leading
* parameter in the text state. This operator has the same effect as the
* code:
* -ty TL
* tx ty Td
*/
case 'TD':
$extractedData[] = $command;
break;
/*
* a b c d e f Tm
* Set the text matrix, Tm, and the text line matrix, Tlm. The operands are
* all numbers, and the initial value for Tm and Tlm is the identity matrix
* [1 0 0 1 0 0]
*/
case 'Tm':
$extractedData[] = $command;
break;
/*
* T*
* Move to the start of the next line. This operator has the same effect
* as the code:
* 0 Tl Td
* Where Tl is the current leading parameter in the text state.
*/
case 'T*':
$extractedData[] = $command;
break;
/*
* string Tj
* Show a Text String
*/
case 'Tj':
$extractedData[] = $command;
break;
/*
* string '
* Move to the next line and show a text string. This operator has the
* same effect as the code:
* T*
* string Tj
*/
case "'":
$extractedData[] = $command;
break;
/*
* aw ac string "
* Move to the next lkine and show a text string, using aw as the word
* spacing and ac as the character spacing. This operator has the same
* effect as the code:
* aw Tw
* ac Tc
* string '
* Tw set the word spacing, Tw, to wordSpace.
* Tc Set the character spacing, Tc, to charsSpace.
*/
case '"':
$extractedData[] = $command;
break;
/*
* array TJ
* Show one or more text strings allow individual glyph positioning.
* Each lement of array con be a string or a number. If the element is
* a string, this operator shows the string. If it is a number, the
* operator adjust the text position by that amount; that is, it translates
* the text matrix, Tm. This amount is substracted form the current
* horizontal or vertical coordinate, depending on the writing mode.
* in the default coordinate system, a positive adjustment has the effect
* of moving the next glyph painted either to the left or down by the given
* amount.
*/
case 'TJ':
$extractedData[] = $command;
break;
default:
}
}
return $extractedData;
}
/**
* Gets the Text Matrix of the text in the page
*
* Return an array where every item is an array where the first item is the
* Text Matrix (Tm) and the second is a string with the text data. The Text matrix
* is an array of 6 numbers. The last 2 numbers are the coordinates X and Y of the
* text. The first 4 numbers has to be with Scalation, Rotation and Skew of the text.
*
* @param array $dataCommands the data extracted by getDataCommands
* if null getDataCommands is called
*
* @return array an array with the data of the page including the Tm information
* of any text in the page
*/
public function getDataTm($dataCommands = null)
{
if (!isset($dataCommands) || !$dataCommands) {
$dataCommands = $this->getDataCommands();
}
/*
* At the beginning of a text object Tm is the identity matrix
*/
$defaultTm = ['1', '0', '0', '1', '0', '0'];
/*
* Set the text leading used by T*, ' and " operators
*/
$defaultTl = 0;
/*
* Setting where are the X and Y coordinates in the matrix (Tm)
*/
$x = 4;
$y = 5;
$Tx = 0;
$Ty = 0;
$Tm = $defaultTm;
$Tl = $defaultTl;
$extractedTexts = $this->getTextArray();
$extractedData = [];
foreach ($dataCommands as $command) {
$currentText = $extractedTexts[\count($extractedData)];
switch ($command['o']) {
/*
* BT
* Begin a text object, inicializind the Tm and Tlm to identity matrix
*/
case 'BT':
$Tm = $defaultTm;
$Tl = $defaultTl; //review this.
$Tx = 0;
$Ty = 0;
break;
/*
* ET
* End a text object, discarding the text matrix
*/
case 'ET':
$Tm = $defaultTm;
$Tl = $defaultTl; //review this
$Tx = 0;
$Ty = 0;
break;
/*
* leading TL
* Set the text leading, Tl, to leading. Tl is used by the T*, ' and " operators.
* Initial value: 0
*/
case 'TL':
$Tl = (float) $command['c'];
break;
/*
* tx ty Td
* Move to the start of the next line, offset form the start of the
* current line by tx, ty.
*/
case 'Td':
$coord = explode(' ', $command['c']);
$Tx += (float) $coord[0];
$Ty += (float) $coord[1];
$Tm[$x] = (string) $Tx;
$Tm[$y] = (string) $Ty;
break;
/*
* tx ty TD
* Move to the start of the next line, offset form the start of the
* current line by tx, ty. As a side effect, this operator set the leading
* parameter in the text state. This operator has the same effect as the
* code:
* -ty TL
* tx ty Td
*/
case 'TD':
$coord = explode(' ', $command['c']);
$Tl = (float) $coord[1];
$Tx += (float) $coord[0];
$Ty -= (float) $coord[1];
$Tm[$x] = (string) $Tx;
$Tm[$y] = (string) $Ty;
break;
/*
* a b c d e f Tm
* Set the text matrix, Tm, and the text line matrix, Tlm. The operands are
* all numbers, and the initial value for Tm and Tlm is the identity matrix
* [1 0 0 1 0 0]
*/
case 'Tm':
$Tm = explode(' ', $command['c']);
$Tx = (float) $Tm[$x];
$Ty = (float) $Tm[$y];
break;
/*
* T*
* Move to the start of the next line. This operator has the same effect
* as the code:
* 0 Tl Td
* Where Tl is the current leading parameter in the text state.
*/
case 'T*':
$Ty -= $Tl;
$Tm[$y] = (string) $Ty;
break;
/*
* string Tj
* Show a Text String
*/
case 'Tj':
$extractedData[] = [$Tm, $currentText];
break;
/*
* string '
* Move to the next line and show a text string. This operator has the
* same effect as the code:
* T*
* string Tj
*/
case "'":
$Ty -= $Tl;
$Tm[$y] = (string) $Ty;
$extractedData[] = [$Tm, $currentText];
break;
/*
* aw ac string "
* Move to the next line and show a text string, using aw as the word
* spacing and ac as the character spacing. This operator has the same
* effect as the code:
* aw Tw
* ac Tc
* string '
* Tw set the word spacing, Tw, to wordSpace.
* Tc Set the character spacing, Tc, to charsSpace.
*/
case '"':
$data = explode(' ', $currentText);
$Ty -= $Tl;
$Tm[$y] = (string) $Ty;
$extractedData[] = [$Tm, $data[2]]; //Verify
break;
/*
* array TJ
* Show one or more text strings allow individual glyph positioning.
* Each lement of array con be a string or a number. If the element is
* a string, this operator shows the string. If it is a number, the
* operator adjust the text position by that amount; that is, it translates
* the text matrix, Tm. This amount is substracted form the current
* horizontal or vertical coordinate, depending on the writing mode.
* in the default coordinate system, a positive adjustment has the effect
* of moving the next glyph painted either to the left or down by the given
* amount.
*/
case 'TJ':
$extractedData[] = [$Tm, $currentText];
break;
default:
}
}
$this->dataTm = $extractedData;
return $extractedData;
}
/**
* Gets text data that are around the given coordinates (X,Y)
*
* If the text is in near the given coordinates (X,Y) (or the TM info),
* the text is returned. The extractedData return by getDataTm, could be use to see
* where is the coordinates of a given text, using the TM info for it.
*
* @param float $x The X value of the coordinate to search for. if null
* just the Y value is considered (same Row)
* @param float $y The Y value of the coordinate to search for
* just the X value is considered (same column)
* @param float $xError The value less or more to consider an X to be "near"
* @param float $yError The value less or more to consider an Y to be "near"
*
* @return array An array of text that are near the given coordinates. If no text
* "near" the x,y coordinate, an empty array is returned. If Both, x
* and y coordinates are null, null is returned.
*/
public function getTextXY($x = null, $y = null, $xError = 0, $yError = 0)
{
if (!isset($this->dataTm) || !$this->dataTm) {
$this->getDataTm();
}
if (null !== $x) {
$x = (float) $x;
}
if (null !== $y) {
$y = (float) $y;
}
if (null === $x && null === $y) {
return [];
}
$xError = (float) $xError;
$yError = (float) $yError;
$extractedData = [];
foreach ($this->dataTm as $item) {
$tm = $item[0];
$xTm = (float) $tm[4];
$yTm = (float) $tm[5];
$text = $item[1];
if (null === $y) {
if (($xTm >= ($x - $xError)) &&
($xTm <= ($x + $xError))) {
$extractedData[] = [$tm, $text];
continue;
}
}
if (null === $x) {
if (($yTm >= ($y - $yError)) &&
($yTm <= ($y + $yError))) {
$extractedData[] = [$tm, $text];
continue;
}
}
if (($xTm >= ($x - $xError)) &&
($xTm <= ($x + $xError)) &&
($yTm >= ($y - $yError)) &&
($yTm <= ($y + $yError))) {
$extractedData[] = [$tm, $text];
continue;
}
}
return $extractedData;
}
}