def get_file_content(filePath):
    """Read a file and return its contents base64-encoded.

    Args:
        filePath: Path of the file to read; it is opened in binary mode.

    Returns:
        bytes: The base64 encoding of the file's raw bytes.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(filePath, 'rb') as fp:
        return base64.b64encode(fp.read())
# Send the API request and fetch the recognition result
def ocr_image(image_path, timeout=30):
    """Run the Baidu OCR API on an image file and return the recognized text.

    The image is base64-encoded with ``get_file_content`` and posted to the
    module-level ``ocr_url`` using the module-level ``headers`` and
    ``access_token``.

    Args:
        image_path: Path of the image file to recognize.
        timeout: Seconds to wait for the HTTP request. Without a timeout a
            stalled connection would block the caller indefinitely.

    Returns:
        The recognized lines joined with newline characters, or None when the
        request fails, the HTTP status is not 200, or the response carries no
        ``words_result`` entry.
    """
    try:
        response = requests.post(
            ocr_url,
            headers=headers,
            params={'access_token': access_token},
            data={'image': get_file_content(image_path)},
            timeout=timeout,
        )
        if response.status_code != 200:
            return None

        # The API answers with a JSON document.
        result = json.loads(response.content)
        if 'words_result' not in result:
            return None
        return "\n".join(item['words'] for item in result['words_result'])
    except Exception as e:
        # Best-effort boundary: any failure (file read, network, malformed
        # response) is reported and turned into a None result.
        print(e)
        return None
# Write the recognition result to the output file
def write_output(output_path, text):
    """Write text to a file as UTF-8, reporting failures instead of raising.

    Args:
        output_path: Path of the file to create or overwrite.
        text: The string to write. Any non-string value (for example None)
            is rejected before the file is opened, so an existing file is
            not truncated by a write that cannot succeed.

    Returns:
        None. Errors are printed, not raised.
    """
    if not isinstance(text, str):
        print(
            f"write_output: expected str, got {type(text).__name__}; "
            f"nothing written to {output_path}"
        )
        return

    try:
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(text)
    except Exception as e:
        # Best-effort: report the failure and carry on.
        print(e)
def remove_extra_newlines(text):
    """Collapse line breaks so that only sentence ends start a new line.

    A run of newlines that follows sentence-ending punctuation is reduced to
    a single newline; a run of newlines after any other character is removed,
    joining the surrounding text. Leading and trailing whitespace is stripped.

    Args:
        text: The string to clean up.

    Returns:
        The cleaned-up string.
    """
    # Sentence-ending punctuation in both ASCII and full-width forms. The
    # full-width ? ! ; : are written as escapes so they cannot be silently
    # folded into their ASCII twins by tools that normalize the source.
    enders = '。.?\uff1f!\uff01;\uff1b:\uff1a'

    # Order matters: first normalize breaks after sentence ends, then drop
    # every remaining break that follows a non-terminating character.
    text = re.sub(rf'([{enders}])\n+', r'\1\n', text)
    text = re.sub(rf'([^{enders}])\n+', r'\1', text)
    return text.strip()