SimplePdfReader.cs

47 lines | 1.438 kB Blame History Raw Download
using System;
using System.Text;
using System.IO;
using System.Threading.Tasks;

using iText.Kernel.Pdf;
using iText.Kernel.Pdf.Canvas.Parser;
using iText.Kernel.Pdf.Canvas.Parser.Listener;

namespace Tools.PdfProvider
{
    /// <summary>
    /// Extracts the plain text content of a PDF document using iText.
    /// </summary>
    public class SimplePdfReader
    {
        /// <summary>
        /// Reads every page of the PDF contained in <paramref name="stream"/> and
        /// returns the extracted text of all pages concatenated in page order.
        /// </summary>
        /// <remarks>
        /// The extraction itself runs synchronously on the calling thread; the
        /// returned task is already completed when this method returns. Any
        /// exception raised while parsing is therefore thrown directly from this
        /// call rather than being stored in the task.
        /// No separator is inserted between the text of consecutive pages.
        /// NOTE(review): disposing the <c>PdfReader</c> may also close the supplied
        /// stream - confirm against the iText documentation before reusing it.
        /// </remarks>
        /// <param name="stream">Stream positioned at the PDF data to read.</param>
        /// <returns>A completed task whose result is the text of all pages.</returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="stream"/> is <c>null</c>.
        /// </exception>
        public Task<string> ReadPdfAsync(Stream stream)
        {
            // Fail fast with a descriptive exception instead of letting a null
            // reference surface from somewhere inside the PDF library.
            if (stream == null)
            {
                throw new ArgumentNullException(nameof(stream));
            }

            StringBuilder text = new StringBuilder();

            using (PdfReader iTextReader = new PdfReader(stream))
            using (PdfDocument pdfDoc = new PdfDocument(iTextReader))
            {
                int pageCount = pdfDoc.GetNumberOfPages();

                // Pages are addressed starting at 1, up to and including pageCount.
                for (int page = 1; page <= pageCount; page++)
                {
                    // A fresh strategy per page so no strategy state carries
                    // over from one page's extraction to the next.
                    ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
                    string pageText = PdfTextExtractor.GetTextFromPage(pdfDoc.GetPage(page), strategy);

                    text.Append(pageText);
                }
            }

            return Task.FromResult(text.ToString());
        }
    }
}