![僅從第一頁 TIFF 獲取輸出到 hocr 提取](https://rvso.com/image/1665767/%E5%83%85%E5%BE%9E%E7%AC%AC%E4%B8%80%E9%A0%81%20TIFF%20%E7%8D%B2%E5%8F%96%E8%BC%B8%E5%87%BA%E5%88%B0%20hocr%20%E6%8F%90%E5%8F%96.png)
我在下面給出的程式碼中尋求您的指導。我正在運行此程式碼以將多頁 TIFF 文字提取為 hocr 格式。我從 TIFF 的第一頁獲取輸出,但其餘頁面被省略。
# Run Tesseract OCR over every image found in one folder and store the
# hOCR result of each image as its own file in a different folder.
# NOTE: as written, only 1 page of a multi-page TIFF ends up in the output.
from PIL import Image
import pytesseract as pt
import os

pt.pytesseract.tesseract_cmd = r'C:\Users\admin\AppData\Local\Programs\Tesseract-OCR\tesseract.exe'


def main():
    """OCR each file in the input folder and write one .hocr file per image."""
    # folder holding the raw images
    path = "D:\\input"
    # folder receiving the generated hOCR files
    tempPath = "D:\\output"

    for imageName in os.listdir(path):
        image = Image.open(os.path.join(path, imageName))
        # No seek() is performed, so only the frame the image opens on
        # is handed to pytesseract.
        hocr_bytes = pt.image_to_pdf_or_hocr(
            image,
            extension='hocr',
            config=r'--oem 3 --psm 6',
            lang="eng",
        )
        target = os.path.join(tempPath, f"time_{imageName}.hocr")
        print(hocr_bytes)
        # one .hocr file per input image, written in binary mode
        out_file = open(target, "wb")
        out_file.write(hocr_bytes)
        out_file.close()


if __name__ == '__main__':
    main()
答案1
編輯:
我檢查過,它可以直接接受檔名(`filename`),而不必傳入 `PIL.Image` 物件:
# Pass the path of the TIFF file itself rather than an opened PIL image.
text = pt.image_to_pdf_or_hocr(
    'D:\\input\\Best time to visit.tiff',
    extension='hocr',
    config=r'--oem 3 --psm 6',
    lang="eng",
)
因此它可以讓 `tesseract` 直接處理原始的 `tiff` 檔案,並將所有頁面轉換成單一的 `hocr` 檔案。
原答案:
我從評論中的連結取得了您的 `tiff` 檔案,並寫了一段將每一頁分別儲存為獨立檔案的程式碼。它使用 `img.seek(page)` 來選擇頁面。用您的檔案測試,對我來說是可行的。
from PIL import Image
import os

folder = '/home/furas/Desktop'
filename = 'Best time to visit.tiff'

# Save every page (frame) of a multi-page image as its own PNG file in
# the same folder, named page-1.png, page-2.png, ...
# The `with` block closes the source image even if saving a page fails.
with Image.open(os.path.join(folder, filename)) as img:
    page = 0
    while True:
        # Only seek() is guarded: EOFError here means there is no such
        # frame, i.e. all pages have been processed.
        try:
            img.seek(page)
        except EOFError:
            # Not enough frames in img
            break

        # Separate name for the output so the input `filename` is kept.
        page_filename = f'page-{page+1}.png'
        print('saving...', page_filename)
        img.save(os.path.join(folder, page_filename))
        page += 1
把類似的做法套用到您的程式碼中,對我來說是可行的:
from PIL import Image
import pytesseract as pt
import os

pt.pytesseract.tesseract_cmd = r'C:\Users\admin\AppData\Local\Programs\Tesseract-OCR\tesseract.exe'

# path for the folder for getting the raw images
path = "D:\\input"
# path for the folder for getting the output
tempPath = "D:\\output"

# OCR every page of every image in `path` and write each page's hOCR
# output to its own file: time_<image name>_<page number>.hocr
for imageName in os.listdir(path):
    # only images
    if not imageName.lower().endswith(('.tiff', '.jpg', '.png')):
        continue

    print(imageName)
    inputPath = os.path.join(path, imageName)

    # The `with` block closes the image even if OCR or writing fails.
    with Image.open(inputPath) as img:
        page = 0
        while True:
            # Only seek() is guarded: EOFError here means there is no such
            # frame, so an EOFError raised while running OCR or writing
            # the output is not mistaken for "no more pages".
            try:
                img.seek(page)
            except EOFError:
                # Not enough frames in img
                break

            text = pt.image_to_pdf_or_hocr(img, extension='hocr', config=r'--oem 3 --psm 6', lang="eng")
            print('page...', page)

            # Incremented before building the name, so file names are
            # numbered from 1 while the printed page index starts at 0.
            page += 1
            fullTempPath = os.path.join(tempPath, f"time_{imageName}_{page}.hocr")

            # saving the text for every page in a separate .hocr file
            with open(fullTempPath, "wb") as file1:
                file1.write(text)
每一頁都必須分別寫入各自的 `.hocr` 檔案,因為如果您嘗試把多頁寫入同一個 `.hocr` 檔案,就會產生損壞的 `.hocr`。
要將所有頁面寫入一個文件中,您需要使用純文字。