Tuesday, September 11, 2018

Use Tesseract OCR with C# to separate receipt images from non-text images

The TesseractEngine C# wrapper is used to identify images that contain text; based on the default confidence and learning, each image is flagged as either to move or not to move. This method returns a List<> of file names that can then be used to move the files to a separate directory.


This comes in handy when you have a dump of phone images in which camera photos are mixed with receipts. You can either move the files manually or use this to reduce the manual burden. See the attached zip, and here for details of the source library and demo.

More Samples here

Download Source Here

The app is run with the following parameters:

ReceiptMover ocr c:\temp\input c:\temp\output *.jpg .4

/// <summary>
        /// Decides whether an image contains enough recognizable text to be treated as a receipt.
        /// Non-TIFF input is first converted to a temporary TIFF, which is then run through
        /// TesseractEngine (English, <c>./tessdata</c>) using single-column page segmentation.
        /// </summary>
        /// <param name="imgPath">Path of the image file to inspect.</param>
        /// <param name="dataPath">
        /// Currently unused: the engine always loads its data from <c>./tessdata</c>.
        /// Kept so existing callers continue to compile.
        /// </param>
        /// <param name="minconf">
        /// Minimum value of <c>page.GetMeanConfidence()</c> required to flag the image as a receipt.
        /// </param>
        /// <returns>
        /// <c>true</c> when text was recognized and its mean confidence is at least
        /// <paramref name="minconf"/>; <c>false</c> otherwise, including when any exception
        /// occurs (the error is logged via <c>WriteLine</c> and not rethrown).
        /// </returns>
        static bool OCRImage(string imgPath, string dataPath, float minconf)
        {
            bool isReceipt = false;
            string tempTif = null;    // non-null only when a temporary TIFF has to be cleaned up
            string ocrPath = imgPath; // file actually handed to the OCR engine

            try
            {
                // Create a TIFF copy if needed. The comparison is ordinal so that it does not
                // depend on the current culture (e.g. "tif".ToUpper() is "TİF" under tr-TR).
                var ext = FileHelper.GetFileNameExtension(imgPath);
                bool isTif = ext != null && ext.StartsWith("TIF", StringComparison.OrdinalIgnoreCase);
                if (!isTif)
                {
                    WriteLine("CONVERT TO TIF ->....... " + FileHelper.GetFileName(imgPath));

                    // Write the copy to a unique file in the system temp directory. Deriving
                    // "<name>.TIF" next to the source would overwrite (and later delete) an
                    // unrelated image that already has that name.
                    // Assigned before Save so a partially written file is still removed below.
                    tempTif = System.IO.Path.Combine(
                        System.IO.Path.GetTempPath(),
                        Guid.NewGuid().ToString("N") + ".tif");

                    using (var bitmap = new Bitmap(imgPath)) // decode the source image with GDI+
                    {
                        bitmap.Save(tempTif, System.Drawing.Imaging.ImageFormat.Tiff);
                    }

                    ocrPath = tempTif;
                    WriteLine("SAVED  TO TIF ->....... " + ocrPath);
                }

                // OCR the TIFF; if any text is found we separate it from non-receipt/non-text images.
                using (var tEngine = new TesseractEngine(@"./tessdata", "eng", EngineMode.Default)) // English OCR engine
                using (var img = Pix.LoadFromFile(ocrPath)) // Pix wraps the Leptonica PIX structure
                {
                    WriteLine("TesseractEngine.Process(img) start at ->....... " + FileHelper.GetFileName(imgPath) + ", " + DateTime.Now.ToString());

                    using (var page = tEngine.Process(img, PageSegMode.SingleColumn))
                    {
                        WriteLine("TesseractEngine.Process(img) END at -> " + FileHelper.GetFileName(imgPath) + ", " + DateTime.Now.ToString());
                        var text = page.GetText(); // the image's content as plain text
                        WriteLine("tEngine.GetText(img) END at -> " + FileHelper.GetFileName(imgPath) + ", " + DateTime.Now.ToString());

                        if (!string.IsNullOrEmpty(text))
                        {
                            // Strip line breaks. The original replaced the literal "/n",
                            // which never matches a newline character.
                            text = text.Trim().Replace("\n", "");
                        }

                        if (StringUtil.IsValid(text))
                        {
                            Debug.WriteLine("Found text: " + text);
                            var conf = page.GetMeanConfidence();
                            WriteLine("GetMeanConfidence-> " + conf.ToString());
                            if (conf >= minconf)
                            {
                                // Report the source file name; the temp TIFF name is meaningless to the user.
                                WriteLine("****Found RECEIPT: " + FileHelper.GetFileName(imgPath));
                                isReceipt = true;
                            }
                        }
                        else
                        {
                            WriteLine("NODATA: " + FileHelper.GetFileName(imgPath));
                        }
                    }
                }
            }
            catch (Exception e)
            {
                // Best effort: a file that cannot be processed is simply not flagged.
                WriteLine("OCRImage Error: " + e.Message);
            }
            finally
            {
                // Remove the temporary TIFF on every exit path, including failed conversions.
                if (tempTif != null && File.Exists(tempTif))
                {
                    FileHelper.KillFile(tempTif);
                }
            }

            return isReceipt;
        }


No comments:

Post a Comment