GemBox.Document is a .NET component that enables you to read, write, edit, convert, and print document files from your .NET applications using one simple API.

Read and Extract PDF Text in C# and VB.NET

GemBox.Document currently supports reading PDF files and extracting their text content from Paragraph and/or Table elements in C# and VB.NET.

The PDF reader is in the beta stage. We plan to improve and extend it over time based on customer feedback.

For more information, see the PDF reader support level section in the help documentation.

The following example demonstrates how to read a PDF file from a file path and output document properties and text that matches a specified regular expression to the Console output.

Screenshot
Read and extract Text from PDF Screenshot

See the full code below; use Run Example to execute it.

Upload your file (Drag files here)

Download a sample file

using System;
using System.Linq;
using System.Text;
using GemBox.Document;
using GemBox.Document.Tables;
using System.Text.RegularExpressions;

/// <summary>
/// Console sample that loads a PDF file with GemBox.Document, prints two of its
/// built-in document properties and lists every match of a regular expression
/// found in the document's text content.
/// </summary>
class Sample
{
    /// <summary>
    /// Loads "CustomInvoice.pdf", gathers the document properties and the
    /// regular-expression matches into a single report and writes it to the console.
    /// </summary>
    /// <param name="args">Command-line arguments; not used.</param>
    [STAThread]
    static void Main(string[] args)
    {
        // If using Professional version, put your serial key below.
        ComponentInfo.SetLicense("FREE-LIMITED-KEY");

        DocumentModel document = DocumentModel.Load("CustomInvoice.pdf");

        StringBuilder sb = new StringBuilder();

        // Read PDF file's document properties.
        // NOTE(review): the indexer is used directly; confirm how it behaves when a
        // property is absent from the loaded PDF.
        sb.AppendFormat("Author: {0}", document.DocumentProperties.BuiltIn[BuiltInDocumentProperty.Author]).AppendLine();
        // The label names the property that is actually read (DateLastSaved).
        sb.AppendFormat("DateLastSaved: {0}", document.DocumentProperties.BuiltIn[BuiltInDocumentProperty.DateLastSaved]).AppendLine();

        // Sample's input parameter.
        string pattern = @"(?<WorkHours>\d+)\s+(?<UnitPrice>\d+\.\d{2})\s+(?<Total>\d+\.\d{2})";
        Regex regex = new Regex(pattern);

        int row = 0;
        StringBuilder line = new StringBuilder();

        // Read PDF file's text content and match a specified regular expression.
        foreach (Match match in regex.Matches(document.Content.ToString()))
        {
            // Reuse the same builder for every result line.
            line.Length = 0;
            line.AppendFormat("Result: {0}: ", ++row);

            // Either write only successfully matched named groups or entire match.
            // Group 0 is the whole match, so iteration starts at 1. Unnamed groups
            // report their number as their name and are skipped by the name check.
            bool hasAny = false;
            for (int i = 1; i < match.Groups.Count; ++i)
            {
                string groupName = regex.GroupNameFromNumber(i);
                Group matchGroup = match.Groups[i];
                if (matchGroup.Success && groupName != i.ToString())
                {
                    line.AppendFormat("{0}= {1}, ", groupName, matchGroup.Value);
                    hasAny = true;
                }
            }

            if (hasAny)
            {
                // Drop the trailing ", " left by the last appended group.
                line.Length -= 2;
            }
            else
            {
                line.Append(match.Value);
            }

            sb.AppendLine(line.ToString());
        }

        Console.WriteLine(sb.ToString());
    }
}
Imports System
Imports System.Linq
Imports System.Text
Imports GemBox.Document
Imports GemBox.Document.Tables
Imports System.Text.RegularExpressions

''' <summary>
''' Console sample that loads a PDF file with GemBox.Document, prints two of its
''' built-in document properties and lists every match of a regular expression
''' found in the document's text content.
''' </summary>
Module Samples

    ''' <summary>
    ''' Loads "CustomInvoice.pdf", gathers the document properties and the
    ''' regular-expression matches into a single report and writes it to the console.
    ''' </summary>
    Sub Main()

        ' If using Professional version, put your serial key below.
        ComponentInfo.SetLicense("FREE-LIMITED-KEY")

        Dim document As DocumentModel = DocumentModel.Load("CustomInvoice.pdf")

        Dim sb As New StringBuilder()

        ' Read PDF file's document properties.
        ' NOTE(review): the indexer is used directly; confirm how it behaves when a
        ' property is absent from the loaded PDF.
        sb.AppendFormat("Author: {0}", document.DocumentProperties.BuiltIn(BuiltInDocumentProperty.Author)).AppendLine()
        ' The label names the property that is actually read (DateLastSaved).
        sb.AppendFormat("DateLastSaved: {0}", document.DocumentProperties.BuiltIn(BuiltInDocumentProperty.DateLastSaved)).AppendLine()

        ' Sample's input parameter.
        Dim pattern As String = "(?<WorkHours>\d+)\s+(?<UnitPrice>\d+\.\d{2})\s+(?<Total>\d+\.\d{2})"
        Dim regex As Regex = New Regex(pattern)

        Dim row As Integer = 0
        Dim line As StringBuilder = New StringBuilder()

        ' Read PDF file's text content and match a specified regular expression.
        For Each match As Match In regex.Matches(document.Content.ToString())
            ' VB.NET has no ++ operator: "++row" is unary plus applied twice and
            ' leaves row unchanged, so the counter is incremented explicitly.
            row += 1

            ' Reuse the same builder for every result line.
            line.Length = 0
            line.AppendFormat("Result: {0}: ", row)

            ' Either write only successfully matched named groups or entire match.
            ' Group 0 is the whole match, so iteration starts at 1. Unnamed groups
            ' report their number as their name and are skipped by the name check.
            Dim hasAny As Boolean = False
            For i As Integer = 1 To match.Groups.Count - 1
                Dim groupName As String = regex.GroupNameFromNumber(i)
                Dim matchGroup As Group = match.Groups(i)
                If matchGroup.Success AndAlso groupName <> i.ToString() Then
                    line.AppendFormat("{0}= {1}, ", groupName, matchGroup.Value)
                    hasAny = True
                End If
            Next

            If hasAny Then
                ' Drop the trailing ", " left by the last appended group.
                line.Length -= 2
            Else
                line.Append(match.Value)
            End If

            sb.AppendLine(line.ToString())
        Next

        Console.WriteLine(sb.ToString())

    End Sub

End Module

Check next sample.