How to extract text from the PDF file and save to database using PHP
Before some time my friend ask me that How to extract PDF content using PHP ? as he want parse PDF files uploaded by users on a website and save the details to database. To fulfill this requirement after a time i used this PHP class to extract text from the PDF file . This php class extract the text content from PDF files’s content as text. Once we got the data in the text format we can manipulate that data using Regular Expressions and save into our database.
<?php /* This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA DESCRIPTION: This is a class to convert PDF files into ASCII text or so called PDF text extraction. It will ignore anything that is not addressed as text within the PDF and any layout. Currently supported filters are: ASCIIHexDecode, ASCII85Decode, FlateDecode PURPOSE(S): Most likely for people that want their PDF to be searchable. SYNTAX: include('class.pdf2text.php'); $a = new PDF2Text(); $a->setFilename('test.pdf'); $a->decodePDF(); echo $a->output(); ALTERNATIVES: Other excellent options to search within a PDF: - Apache PDFbox (http://pdfbox.apache.org/). An open source Java solution - pdflib TET (http://www.pdflib.com/products/tet/) - Online converter: http://snowtide.com/PDFTextStream */ class PDF2Text { // Some settings var $multibyte = 2; // Use setUnicode(TRUE|FALSE) var $convertquotes = ENT_QUOTES; // ENT_COMPAT (double-quotes), ENT_QUOTES (Both), ENT_NOQUOTES (None) // Variables var $filename = ''; var $decodedtext = ''; function setFilename($filename) { // Reset $this->decodedtext = ''; $this->filename = $filename; } function output($echo = false) { if($echo) echo $this->decodedtext; else return $this->decodedtext; } function setUnicode($input) { // 4 for unicode. But 2 should work in most cases just fine if($input == true) $this->multibyte = 4; else $this->multibyte = 2; } function decodePDF() { // Read the data from pdf file $infile = @file_get_contents($this->filename, FILE_BINARY); if (empty($infile)) return ""; // Get all text data. $transformations = array(); $texts = array(); // Get the list of all objects. preg_match_all("#obj[\n|\r](.*)endobj[\n|\r]#ismU", $infile, $objects); $objects = @$objects[1]; // Select objects with streams. for ($i = 0; $i < count($objects); $i++) { $currentObject = $objects[$i]; // Check if an object includes data stream. if (preg_match("#stream[\n|\r](.*)endstream[\n|\r]#ismU", $currentObject, $stream)) { $stream = ltrim($stream[1]); // Check object parameters and look for text data. $options = $this->getObjectOptions($currentObject); if (!(empty($options["Length1"]) && empty($options["Type"]) && empty($options["Subtype"]))) continue; // Hack, length doesnt always seem to be correct unset($options["Length"]); // So, we have text data. Decode it. $data = $this->getDecodedStream($stream, $options); if (strlen($data)) { if (preg_match_all("#BT[\n|\r](.*)ET[\n|\r]#ismU", $data, $textContainers)) { $textContainers = @$textContainers[1]; $this->getDirtyTexts($texts, $textContainers); } else $this->getCharTransformations($transformations, $data); } } } // Analyze text blocks taking into account character transformations and return results. $this->decodedtext = $this->getTextUsingTransformations($texts, $transformations); } function decodeAsciiHex($input) { $output = ""; $isOdd = true; $isComment = false; for($i = 0, $codeHigh = -1; $i < strlen($input) && $input[$i] != '>'; $i++) { $c = $input[$i]; if($isComment) { if ($c == '\r' || $c == '\n') $isComment = false; continue; } switch($c) { case '\0': case '\t': case '\r': case '\f': case '\n': case ' ': break; case '%': $isComment = true; break; default: $code = hexdec($c); if($code === 0 && $c != '0') return ""; if($isOdd) $codeHigh = $code; else $output .= chr($codeHigh * 16 + $code); $isOdd = !$isOdd; break; } } if($input[$i] != '>') return ""; if($isOdd) $output .= chr($codeHigh * 16); return $output; } function decodeAscii85($input) { $output = ""; $isComment = false; $ords = array(); for($i = 0, $state = 0; $i < strlen($input) && $input[$i] != '~'; $i++) { $c = $input[$i]; if($isComment) { if ($c == '\r' || $c == '\n') $isComment = false; continue; } if ($c == '\0' || $c == '\t' || $c == '\r' || $c == '\f' || $c == '\n' || $c == ' ') continue; if ($c == '%') { $isComment = true; continue; } if ($c == 'z' && $state === 0) { $output .= str_repeat(chr(0), 4); continue; } if ($c < '!' || $c > 'u') return ""; $code = ord($input[$i]) & 0xff; $ords[$state++] = $code - ord('!'); if ($state == 5) { $state = 0; for ($sum = 0, $j = 0; $j < 5; $j++) $sum = $sum * 85 + $ords[$j]; for ($j = 3; $j >= 0; $j--) $output .= chr($sum >> ($j * 8)); } } if ($state === 1) return ""; elseif ($state > 1) { for ($i = 0, $sum = 0; $i < $state; $i++) $sum += ($ords[$i] + ($i == $state - 1)) * pow(85, 4 - $i); for ($i = 0; $i < $state - 1; $i++) $ouput .= chr($sum >> ((3 - $i) * 8)); } return $output; } function decodeFlate($input) { return gzuncompress($input); } function getObjectOptions($object) { $options = array(); if (preg_match("#<<(.*)>>#ismU", $object, $options)) { $options = explode("/", $options[1]); @array_shift($options); $o = array(); for ($j = 0; $j < @count($options); $j++) { $options[$j] = preg_replace("#\s+#", " ", trim($options[$j])); if (strpos($options[$j], " ") !== false) { $parts = explode(" ", $options[$j]); $o[$parts[0]] = $parts[1]; } else $o[$options[$j]] = true; } $options = $o; unset($o); } return $options; } function getDecodedStream($stream, $options) { $data = ""; if (empty($options["Filter"])) $data = $stream; else { $length = !empty($options["Length"]) ? $options["Length"] : strlen($stream); $_stream = substr($stream, 0, $length); foreach ($options as $key => $value) { if ($key == "ASCIIHexDecode") $_stream = $this->decodeAsciiHex($_stream); if ($key == "ASCII85Decode") $_stream = $this->decodeAscii85($_stream); if ($key == "FlateDecode") $_stream = $this->decodeFlate($_stream); if ($key == "Crypt") { // TO DO } } $data = $_stream; } return $data; } function getDirtyTexts(&$texts, $textContainers) { for ($j = 0; $j < count($textContainers); $j++) { if (preg_match_all("#\[(.*)\]\s*TJ[\n|\r]#ismU", $textContainers[$j], $parts)) $texts = array_merge($texts, @$parts[1]); elseif(preg_match_all("#T[d|w|m|f]\s*(\(.*\))\s*Tj[\n|\r]#ismU", $textContainers[$j], $parts)) $texts = array_merge($texts, @$parts[1]); elseif(preg_match_all("#T[d|w|m|f]\s*(\[.*\])\s*Tj[\n|\r]#ismU", $textContainers[$j], $parts)) $texts = array_merge($texts, @$parts[1]); } } function getCharTransformations(&$transformations, $stream) { preg_match_all("#([0-9]+)\s+beginbfchar(.*)endbfchar#ismU", $stream, $chars, PREG_SET_ORDER); preg_match_all("#([0-9]+)\s+beginbfrange(.*)endbfrange#ismU", $stream, $ranges, PREG_SET_ORDER); for ($j = 0; $j < count($chars); $j++) { $count = $chars[$j][1]; $current = explode("\n", trim($chars[$j][2])); for ($k = 0; $k < $count && $k < count($current); $k++) { if (preg_match("#<([0-9a-f]{2,4})>\s+<([0-9a-f]{4,512})>#is", trim($current[$k]), $map)) $transformations[str_pad($map[1], 4, "0")] = $map[2]; } } for ($j = 0; $j < count($ranges); $j++) { $count = $ranges[$j][1]; $current = explode("\n", trim($ranges[$j][2])); for ($k = 0; $k < $count && $k < count($current); $k++) { if (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+<([0-9a-f]{4})>#is", trim($current[$k]), $map)) { $from = hexdec($map[1]); $to = hexdec($map[2]); $_from = hexdec($map[3]); for ($m = $from, $n = 0; $m <= $to; $m++, $n++) $transformations[sprintf("%04X", $m)] = sprintf("%04X", $_from + $n); } elseif (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+\[(.*)\]#ismU", trim($current[$k]), $map)) { $from = hexdec($map[1]); $to = hexdec($map[2]); $parts = preg_split("#\s+#", trim($map[3])); for ($m = $from, $n = 0; $m <= $to && $n < count($parts); $m++, $n++) $transformations[sprintf("%04X", $m)] = sprintf("%04X", hexdec($parts[$n])); } } } } function getTextUsingTransformations($texts, $transformations) { $document = ""; for ($i = 0; $i < count($texts); $i++) { $isHex = false; $isPlain = false; $hex = ""; $plain = ""; for ($j = 0; $j < strlen($texts[$i]); $j++) { $c = $texts[$i][$j]; switch($c) { case "<": $hex = ""; $isHex = true; break; case ">": $hexs = str_split($hex, $this->multibyte); // 2 or 4 (UTF8 or ISO) for ($k = 0; $k < count($hexs); $k++) { $chex = str_pad($hexs[$k], 4, "0"); // Add tailing zero if (isset($transformations[$chex])) $chex = $transformations[$chex]; $document .= html_entity_decode("&#x".$chex.";"); } $isHex = false; break; case "(": $plain = ""; $isPlain = true; break; case ")": $document .= $plain; $isPlain = false; break; case "\\": $c2 = $texts[$i][$j + 1]; if (in_array($c2, array("\\", "(", ")"))) $plain .= $c2; elseif ($c2 == "n") $plain .= '\n'; elseif ($c2 == "r") $plain .= '\r'; elseif ($c2 == "t") $plain .= '\t'; elseif ($c2 == "b") $plain .= '\b'; elseif ($c2 == "f") $plain .= '\f'; elseif ($c2 >= '0' && $c2 <= '9') { $oct = preg_replace("#[^0-9]#", "", substr($texts[$i], $j + 1, 3)); $j += strlen($oct) - 1; $plain .= html_entity_decode("&#".octdec($oct).";", $this->convertquotes); } $j++; break; default: if ($isHex) $hex .= $c; if ($isPlain) $plain .= $c; break; } } $document .= "\n"; } return $document; } } ?>
2 comments on “How to extract text from the PDF file and save to database using PHP”
This is a topic close to my heart cheers. Thanks
EXCELLENT Post.thanks for share..more delay.