Read text from PDF files

The following example shows how you can use GemBox.Pdf to easily read the text content of each page in your PDF document.

C#
VB.NET

View on GitHub

using GemBox.Pdf;
using System;

/// <summary>
/// Console example that prints the text content of every page in a PDF document.
/// </summary>
class Program
{
    /// <summary>
    /// Sets the GemBox.Pdf license key, loads the input document and prints the text of its pages.
    /// </summary>
    static void Main()
    {
        // If using the Professional version, put your serial key below.
        ComponentInfo.SetLicense("FREE-LIMITED-KEY");

        // "%InputFileName%" is kept verbatim; presumably a placeholder that is
        // replaced with a real path before the example is run.
        using (var pdf = PdfDocument.Load("%InputFileName%"))
        {
            WritePageTexts(pdf);
        }
    }

    /// <summary>
    /// Writes the Unicode text content of each page of <paramref name="pdf"/> to the console,
    /// each page's text followed by a line terminator.
    /// </summary>
    /// <param name="pdf">The loaded document whose pages are read in order.</param>
    private static void WritePageTexts(PdfDocument pdf)
    {
        foreach (var currentPage in pdf.Pages)
        {
            string pageText = currentPage.Content.ToString();
            Console.WriteLine(pageText);
        }
    }
}

Imports GemBox.Pdf
Imports System

''' <summary>
''' Console example that prints the text content of every page in a PDF document.
''' </summary>
Module Program

    ''' <summary>
    ''' Sets the GemBox.Pdf license key, loads the input document and prints the text of its pages.
    ''' </summary>
    Sub Main()

        ' If using the Professional version, put your serial key below.
        ComponentInfo.SetLicense("FREE-LIMITED-KEY")

        ' "%InputFileName%" is kept verbatim; presumably a placeholder that is
        ' replaced with a real path before the example is run.
        Using pdf = PdfDocument.Load("%InputFileName%")

            WritePageTexts(pdf)
        End Using
    End Sub

    ''' <summary>
    ''' Writes the Unicode text content of each page of <paramref name="pdf"/> to the console,
    ''' each page's text followed by a line terminator.
    ''' </summary>
    ''' <param name="pdf">The loaded document whose pages are read in order.</param>
    Private Sub WritePageTexts(pdf As PdfDocument)

        For Each currentPage In pdf.Pages

            Dim pageText As String = currentPage.Content.ToString()
            Console.WriteLine(pageText)
        Next
    End Sub
End Module

Reading PDF file's text in C# and VB.NET — Screenshot of read text from input PDF

Reading additional information about a text

The PdfTextContent elements can be used to extract additional information about the text, such as its bounds, font, and color, as shown in the next example.

C#
VB.NET

View on GitHub

using GemBox.Pdf;
using GemBox.Pdf.Content;
using System;

/// <summary>
/// Console example that prints each text element of a PDF document together with
/// its font, fill color and bounds.
/// </summary>
class Program
{
    /// <summary>
    /// Sets the GemBox.Pdf license key, loads the input document and, for every text
    /// content element on every page, writes its text, font details, RGB fill color
    /// (when available) and bounds to the console.
    /// </summary>
    static void Main()
    {
        // If using the Professional version, put your serial key below.
        ComponentInfo.SetLicense("FREE-LIMITED-KEY");

        using (var pdf = PdfDocument.Load("%InputFileName%"))
        {
            foreach (var currentPage in pdf.Pages)
            {
                // The enumerator is driven by hand (instead of foreach) because its
                // Transform property is needed for each visited element.
                var elements = currentPage.Content.Elements.All(currentPage.Transform).GetEnumerator();
                while (elements.MoveNext())
                {
                    var element = elements.Current;

                    // Only text content elements are of interest; skip everything else.
                    if (element.ElementType != PdfContentElementType.Text)
                    {
                        continue;
                    }

                    var textElement = (PdfTextContent)element;

                    // Gather everything first so that output starts only after all reads succeeded.
                    var text = textElement.ToString();
                    var font = textElement.Format.Text.Font;
                    var color = textElement.Format.Fill.Color;
                    var bounds = textElement.Bounds;

                    // Apply the enumerator's current transform to the bounds in place (passed by ref).
                    // NOTE(review): presumably this maps the bounds into page coordinates - confirm in the GemBox.Pdf docs.
                    elements.Transform.Transform(ref bounds);

                    // Read the text content element's additional information.
                    Console.WriteLine($"Unicode text: {text}");
                    Console.WriteLine($"Font name: {font.Face.Family.Name}");
                    Console.WriteLine($"Font size: {font.Size}");
                    Console.WriteLine($"Font style: {font.Face.Style}");
                    Console.WriteLine($"Font weight: {font.Face.Weight}");

                    // The color line is printed only when TryGetRgb reports success.
                    double red;
                    double green;
                    double blue;
                    if (color.TryGetRgb(out red, out green, out blue))
                    {
                        Console.WriteLine($"Color: Red={red}, Green={green}, Blue={blue}");
                    }

                    Console.WriteLine($"Bounds: Left={bounds.Left:0.00}, Bottom={bounds.Bottom:0.00}, Right={bounds.Right:0.00}, Top={bounds.Top:0.00}");
                    Console.WriteLine();
                }
            }
        }
    }
}

Imports GemBox.Pdf
Imports GemBox.Pdf.Content
Imports System

''' <summary>
''' Console example that prints each text element of a PDF document together with
''' its font, fill color and bounds.
''' </summary>
Module Program

    ''' <summary>
    ''' Sets the GemBox.Pdf license key, loads the input document and, for every text
    ''' content element on every page, writes its text, font details, RGB fill color
    ''' (when available) and bounds to the console.
    ''' </summary>
    Sub Main()

        ' If using the Professional version, put your serial key below.
        ComponentInfo.SetLicense("FREE-LIMITED-KEY")

        Using pdf = PdfDocument.Load("%InputFileName%")

            For Each currentPage In pdf.Pages

                ' The enumerator is driven by hand (instead of For Each) because its
                ' Transform property is needed for each visited element.
                Dim elements = currentPage.Content.Elements.All(currentPage.Transform).GetEnumerator()
                While elements.MoveNext()

                    Dim element = elements.Current

                    ' Only text content elements are of interest; skip everything else.
                    If element.ElementType <> PdfContentElementType.Text Then
                        Continue While
                    End If

                    Dim textElement = CType(element, PdfTextContent)

                    ' Gather everything first so that output starts only after all reads succeeded.
                    Dim text = textElement.ToString()
                    Dim font = textElement.Format.Text.Font
                    Dim color = textElement.Format.Fill.Color
                    Dim bounds = textElement.Bounds

                    ' Apply the enumerator's current transform to the bounds; the original example's
                    ' use of bounds afterwards implies the argument is passed ByRef and updated in place.
                    ' NOTE(review): presumably this maps the bounds into page coordinates - confirm in the GemBox.Pdf docs.
                    elements.Transform.Transform(bounds)

                    ' Read the text content element's additional information.
                    Console.WriteLine($"Unicode text: {text}")
                    Console.WriteLine($"Font name: {font.Face.Family.Name}")
                    Console.WriteLine($"Font size: {font.Size}")
                    Console.WriteLine($"Font style: {font.Face.Style}")
                    Console.WriteLine($"Font weight: {font.Face.Weight}")

                    ' The color line is printed only when TryGetRgb reports success.
                    Dim red As Double
                    Dim green As Double
                    Dim blue As Double
                    If color.TryGetRgb(red, green, blue) Then
                        Console.WriteLine($"Color: Red={red}, Green={green}, Blue={blue}")
                    End If

                    Console.WriteLine($"Bounds: Left={bounds.Left:0.00}, Bottom={bounds.Bottom:0.00}, Right={bounds.Right:0.00}, Top={bounds.Top:0.00}")
                    Console.WriteLine()
                End While
            Next
        End Using
    End Sub
End Module

Retrieving PDF text elements in C# and VB.NET — Screenshot of read text elements from input PDF

GemBox.Pdf simplifies PDF page content operations by representing the content as a sequence of parsed, or compiled, elements, such as text, path, and external objects (images and forms). For more information, see the Content Streams and Resources help page.

Reading text from a specific rectangular area

With GemBox.Pdf, you can extract a PDF document's text from a specific rectangular area. To do this, you define the bounds of the targeted area and retrieve only the PdfTextContent elements that are within it, as shown in the next example.

C#
VB.NET

View on GitHub

using GemBox.Pdf;
using GemBox.Pdf.Content;
using System;

/// <summary>
/// Console example that prints only the text elements lying strictly inside a
/// fixed rectangular area of one page of a PDF document.
/// </summary>
class Program
{
    // Zero-based index of the page that is inspected (0 = first page).
    private const int PageIndex = 0;

    // Edges of the targeted rectangle; an element qualifies only when its
    // transformed bounds lie strictly between these values on both axes.
    private const double AreaLeft = 400;
    private const double AreaRight = 550;
    private const double AreaBottom = 680;
    private const double AreaTop = 720;

    /// <summary>
    /// Sets the GemBox.Pdf license key, loads the input document and writes to the console
    /// the text of every text content element on the selected page whose bounds fall inside the area.
    /// </summary>
    static void Main()
    {
        // If using the Professional version, put your serial key below.
        ComponentInfo.SetLicense("FREE-LIMITED-KEY");

        using (var pdf = PdfDocument.Load("%InputFileName%"))
        {
            // Retrieve first page object.
            var targetPage = pdf.Pages[PageIndex];

            // The enumerator is driven by hand because its Transform property
            // is needed for each visited element.
            var elements = targetPage.Content.Elements.All(targetPage.Transform).GetEnumerator();
            while (elements.MoveNext())
            {
                var element = elements.Current;

                // Only text content elements are of interest; skip everything else.
                if (element.ElementType != PdfContentElementType.Text)
                {
                    continue;
                }

                var textElement = (PdfTextContent)element;
                var bounds = textElement.Bounds;

                // Apply the enumerator's current transform to the bounds in place (passed by ref).
                // NOTE(review): presumably this maps the bounds into page coordinates - confirm in the GemBox.Pdf docs.
                elements.Transform.Transform(ref bounds);

                // Horizontal test first, vertical second - same evaluation order as a single && chain.
                var fitsHorizontally = bounds.Left > AreaLeft && bounds.Right < AreaRight;
                if (!fitsHorizontally)
                {
                    continue;
                }

                var fitsVertically = bounds.Bottom > AreaBottom && bounds.Top < AreaTop;
                if (!fitsVertically)
                {
                    continue;
                }

                // Written without a line terminator, so matching elements are output back to back.
                Console.Write(textElement.ToString());
            }
        }
    }
}

Imports GemBox.Pdf
Imports GemBox.Pdf.Content
Imports System

''' <summary>
''' Console example that prints only the text elements lying strictly inside a
''' fixed rectangular area of one page of a PDF document.
''' </summary>
Module Program

    ' Zero-based index of the page that is inspected (0 = first page).
    Private Const TargetPageIndex As Integer = 0

    ' Edges of the targeted rectangle; an element qualifies only when its
    ' transformed bounds lie strictly between these values on both axes.
    Private Const TargetLeft As Double = 400
    Private Const TargetRight As Double = 550
    Private Const TargetBottom As Double = 680
    Private Const TargetTop As Double = 720

    ''' <summary>
    ''' Sets the GemBox.Pdf license key, loads the input document and writes to the console
    ''' the text of every text content element on the selected page whose bounds fall inside the area.
    ''' </summary>
    Sub Main()

        ' If using the Professional version, put your serial key below.
        ComponentInfo.SetLicense("FREE-LIMITED-KEY")

        Using pdf = PdfDocument.Load("%InputFileName%")

            ' Retrieve first page object.
            Dim targetPage = pdf.Pages(TargetPageIndex)

            ' The enumerator is driven by hand because its Transform property
            ' is needed for each visited element.
            Dim elements = targetPage.Content.Elements.All(targetPage.Transform).GetEnumerator()
            While elements.MoveNext()

                Dim element = elements.Current

                ' Only text content elements are of interest; skip everything else.
                If element.ElementType <> PdfContentElementType.Text Then
                    Continue While
                End If

                Dim textElement = CType(element, PdfTextContent)
                Dim bounds = textElement.Bounds

                ' Apply the enumerator's current transform to the bounds; the original example's
                ' use of bounds afterwards implies the argument is passed ByRef and updated in place.
                ' NOTE(review): presumably this maps the bounds into page coordinates - confirm in the GemBox.Pdf docs.
                elements.Transform.Transform(bounds)

                ' Horizontal test first, vertical second - same evaluation order as a single AndAlso chain.
                Dim fitsHorizontally = bounds.Left > TargetLeft AndAlso bounds.Right < TargetRight
                If Not fitsHorizontally Then
                    Continue While
                End If

                Dim fitsVertically = bounds.Bottom > TargetBottom AndAlso bounds.Top < TargetTop
                If Not fitsVertically Then
                    Continue While
                End If

                ' Written without a line terminator, so matching elements are output back to back.
                Console.Write(textElement.ToString())
            End While
        End Using
    End Sub
End Module

Reading PDF text from specified area in C# and VB.NET — Screenshot of read text elements in specified area from input PDF

Next steps

GemBox.Pdf is a .NET component that enables developers to read, merge and split PDF files or execute low-level object manipulations from .NET applications in a simple and efficient way.

Download Buy

Read text from PDF files

Reading additional information about a text

Reading text from a specific rectangular area

See also

Next steps