读光产品文档

PDF识别接口文档

调用地址：https://generalpdf.market.alicloudapi.com/ocrservice/pdf
请求方式：POST
返回类型：JSON

请求参数(Body)：

{
  //文件数据：base64编码，要求base64编码后大小不超过100M，页数不超过20页，和url参数只能同时存在一个
  "fileBase64": "",
  //文件url地址：完整URL，URL长度不超过1024字节，URL对应的文件base64编码后大小不超过100M，页数不超过20页，和fileBase64参数只能同时存在一个
  "url": "",
  //是否需要识别结果中每一行的置信度，默认不需要。 true：需要 false：不需要
  "prob": false,
  //是否需要单字识别功能，默认不需要。 true：需要 false：不需要
  "charInfo": false,
  //是否需要自动旋转功能，默认不需要。 true：需要 false：不需要
  "rotate": false,
  //是否需要表格识别功能，默认不需要。 true：需要 false：不需要
  "table": false,
  //转文件类型，word
  "fileType":"word"
}

请求代码示例：

Java版

    /**
     * Posts a JSON request to the PDF OCR endpoint and prints the response.
     *
     * The request body must be strict JSON. Field notes therefore live in the
     * source comments below instead of inside the payload string.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        String url = "https://generalpdf.market.alicloudapi.com/ocrservice/pdf";
        String appcode = "你自己的AppCode";
        HashMap<String, String> headers = new HashMap<String, String>();
        // Final header format (single ASCII space in the middle):
        // Authorization:APPCODE 83359fd73fe94948385f570e3c139105
        headers.put("Authorization", "APPCODE " + appcode);
        // Content-Type required by the API.
        headers.put("Content-Type", "application/json; charset=UTF-8");
        // Request fields:
        //   fileBase64 - base64-encoded file data (at most 100M after encoding, at most 20 pages)
        //   url        - full URL of the file (at most 1024 bytes; same size and page limits)
        //   prob       - true to return a confidence value for every line
        //   charInfo   - true to enable single-character recognition
        //   rotate     - true to enable automatic rotation
        //   table      - true to enable table recognition
        //   fileType   - output file type, "word"
        // fileBase64 and url are mutually exclusive: fill in one and delete the other key.
        String bodys = "{\"fileBase64\":\"\",\"url\":\"\",\"prob\":false,\"charInfo\":false,"
                + "\"rotate\":false,\"table\":false,\"fileType\":\"word\"}";
        try {
            /*
             * Important:
             * Download HttpClientUtils from
             * https://gitee.com/duguangdemo/publicclouddemo/blob/master/src/main/java/util/HttpClientUtils.java
             * and HttpExecuteResponse from
             * https://gitee.com/duguangdemo/publicclouddemo/blob/master/src/main/java/util/HttpExecuteResponse.java
             *
             * For the required dependencies see
             * https://gitee.com/duguangdemo/publicclouddemo/blob/master/pom.xml
             */
            HttpExecuteResponse response = HttpClientUtils.doPost(url, bodys, headers);
            System.out.println(response.getResponseAsString());
            System.out.println(response.toString());
            // To inspect the response headers while troubleshooting, use:
            // for (Object json : response.getHeaders()) {
            //     System.out.println(json);
            // }

        } catch (Exception e) {
            e.printStackTrace();
        }
    }

C#版

//using System.IO;
//using System.Text;
//using System.Net;
//using System.Net.Security;
//using System.Security.Cryptography.X509Certificates;


        // Scheme and host name of the service; combined with path to form the request URL.
        private const String host = "https://generalpdf.market.alicloudapi.com";
        // Resource path of the PDF recognition API, appended to host.
        private const String path = "/ocrservice/pdf";
        // HTTP method assigned to the request.
        private const String method = "POST";
        // Placeholder value sent after "APPCODE " in the Authorization header; replace with your own AppCode.
        private const String appcode = "你自己的AppCode";

        /// <summary>
        /// Posts a JSON request to the PDF OCR endpoint and writes the status code,
        /// method, response headers and response body to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            String querys = "";
            // The body must be strict JSON, so field notes live here rather than in the payload:
            //   fileBase64 - base64-encoded file data (at most 100M after encoding, at most 20 pages)
            //   url        - full URL of the file (at most 1024 bytes; same size and page limits)
            //   prob       - true to return a confidence value for every line
            //   charInfo   - true to enable single-character recognition
            //   rotate     - true to enable automatic rotation
            //   table      - true to enable table recognition
            //   fileType   - output file type, "word"
            // fileBase64 and url are mutually exclusive: fill in one and delete the other key.
            String bodys = "{\"fileBase64\":\"\",\"url\":\"\",\"prob\":false,\"charInfo\":false,\"rotate\":false,\"table\":false,\"fileType\":\"word\"}";
            String url = host + path;

            if (0 < querys.Length)
            {
                url = url + "?" + querys;
            }

            // No ServerCertificateValidationCallback is installed here, so the
            // framework's default certificate validation applies to the HTTPS call.
            HttpWebRequest httpRequest = (HttpWebRequest)WebRequest.Create(url);
            httpRequest.Method = method;
            httpRequest.Headers.Add("Authorization", "APPCODE " + appcode);
            // Content-Type required by the API.
            httpRequest.ContentType = "application/json; charset=UTF-8";

            HttpWebResponse httpResponse = null;
            try
            {
                try
                {
                    if (0 < bodys.Length)
                    {
                        byte[] data = Encoding.UTF8.GetBytes(bodys);
                        using (Stream stream = httpRequest.GetRequestStream())
                        {
                            stream.Write(data, 0, data.Length);
                        }
                    }
                    httpResponse = (HttpWebResponse)httpRequest.GetResponse();
                }
                catch (WebException ex)
                {
                    // Non-success HTTP statuses still carry a response worth printing.
                    // Transport-level failures carry none, so report the message and stop.
                    httpResponse = ex.Response as HttpWebResponse;
                    if (httpResponse == null)
                    {
                        Console.WriteLine(ex.Message);
                        return;
                    }
                }

                Console.WriteLine(httpResponse.StatusCode);
                Console.WriteLine(httpResponse.Method);
                Console.WriteLine(httpResponse.Headers);
                using (Stream st = httpResponse.GetResponseStream())
                using (StreamReader reader = new StreamReader(st, Encoding.UTF8))
                {
                    Console.WriteLine(reader.ReadToEnd());
                }
                Console.WriteLine("\n");
            }
            finally
            {
                // Release the connection even when reading the response fails.
                if (httpResponse != null)
                {
                    httpResponse.Close();
                }
            }
        }

        /// <summary>
        /// Certificate validation callback that accepts the server certificate only
        /// when no SSL policy errors were reported for it.
        /// </summary>
        /// <param name="sender">Object that raised the validation request (unused).</param>
        /// <param name="certificate">Certificate presented by the remote party (unused).</param>
        /// <param name="chain">Certificate chain built for the certificate (unused).</param>
        /// <param name="errors">Policy errors reported for the certificate.</param>
        /// <returns>true when <paramref name="errors"/> is SslPolicyErrors.None; otherwise false.</returns>
        public static bool CheckValidationResult(object sender, X509Certificate certificate, X509Chain chain, SslPolicyErrors errors)
        {
            // Returning true unconditionally would accept any certificate, including
            // forged ones, and defeat the purpose of using HTTPS.
            return errors == SslPolicyErrors.None;
        }

PHP版:

<?php
    // Posts a JSON request to the PDF OCR endpoint and dumps the raw response
    // (headers and body) or, when the transfer itself fails, the cURL error.
    $host = "https://generalpdf.market.alicloudapi.com";
    $path = "/ocrservice/pdf";
    $method = "POST";
    $appcode = "你自己的AppCode";
    $headers = array();
    array_push($headers, "Authorization:APPCODE " . $appcode);
    // Content-Type required by the API.
    array_push($headers, "Content-Type".":"."application/json; charset=UTF-8");
    $querys = "";
    // The body must be strict JSON, so it is built with json_encode and the
    // field notes are kept here as source comments.
    // fileBase64 and url are mutually exclusive: fill in one and delete the other key.
    $bodys = json_encode(array(
        // Base64-encoded file data (at most 100M after encoding, at most 20 pages).
        "fileBase64" => "",
        // Full URL of the file (at most 1024 bytes; same size and page limits).
        "url" => "",
        // true to return a confidence value for every line.
        "prob" => false,
        // true to enable single-character recognition.
        "charInfo" => false,
        // true to enable automatic rotation.
        "rotate" => false,
        // true to enable table recognition.
        "table" => false,
        // Output file type.
        "fileType" => "word"
    ));
    $url = $host . $path;
    if (0 < strlen($querys))
    {
        $url = $url . "?" . $querys;
    }

    $curl = curl_init();
    curl_setopt($curl, CURLOPT_CUSTOMREQUEST, $method);
    curl_setopt($curl, CURLOPT_URL, $url);
    curl_setopt($curl, CURLOPT_HTTPHEADER, $headers);
    curl_setopt($curl, CURLOPT_FAILONERROR, false);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($curl, CURLOPT_HEADER, true);
    // Certificate and host verification are left at cURL's defaults; switching
    // them off would allow the HTTPS connection to be intercepted.
    curl_setopt($curl, CURLOPT_POSTFIELDS, $bodys);
    $response = curl_exec($curl);
    if ($response === false)
    {
        // curl_exec returns false on transport failures; show the reason.
        echo "cURL error: " . curl_error($curl) . "\n";
    }
    else
    {
        var_dump($response);
    }
?>

Python2：

# -*- coding: utf-8 -*-
# The coding line above must stay on the first or second line when this sample
# is saved as its own file; Python 2 rejects non-ASCII literals without it.
import json
import urllib, urllib2, sys
import ssl


host = 'https://generalpdf.market.alicloudapi.com'
path = '/ocrservice/pdf'
method = 'POST'
appcode = '你自己的AppCode'
querys = ''
url = host + path

# Request fields. The body must be strict JSON, so it is built with json.dumps
# and the field notes are kept here as source comments.
# fileBase64 and url are mutually exclusive: fill in one and delete the other key.
bodys = {
    # Base64-encoded file data (at most 100M after encoding, at most 20 pages).
    'fileBase64': '',
    # Full URL of the file (at most 1024 bytes; same size and page limits).
    'url': '',
    # True to return a confidence value for every line.
    'prob': False,
    # True to enable single-character recognition.
    'charInfo': False,
    # True to enable automatic rotation.
    'rotate': False,
    # True to enable table recognition.
    'table': False,
    # Output file type.
    'fileType': 'word',
}
post_data = json.dumps(bodys)
# Passing a data argument makes urllib2 send the request as POST.
request = urllib2.Request(url, post_data)
request.add_header('Authorization', 'APPCODE ' + appcode)

request.add_header('Content-Type', 'application/json; charset=UTF-8')
# The default context keeps certificate and host name verification enabled.
ctx = ssl.create_default_context()
try:
    response = urllib2.urlopen(request, context=ctx)
    content = response.read()
    if (content):
        print(content)
except urllib2.HTTPError as e:
    # Non-2xx replies raise HTTPError; print the status and the error body
    # so the service's error_code / error_msg are visible.
    print(e.code)
    print(e.read())

Python3：

import urllib.error
import urllib.request
import urllib.parse
import json
import time
import base64

# Request headers.
# Replace with your own AppCode, available from the cloud market order page
# or the API gateway console.
AppCode = "你自己的AppCode"
headers = {
    'Authorization': 'APPCODE ' + AppCode,
    'Content-Type': 'application/json; charset=UTF-8'
}


def posturl(url, data=None):
    """POST ``data`` as a JSON body to ``url`` and return the decoded response.

    Args:
        url: Full request URL.
        data: JSON-serializable mapping used as the request body. ``None``
            (the default) sends an empty JSON object.

    Returns:
        The response body decoded as UTF-8, or ``None`` when the server
        answered with an HTTP error (its status code and body are printed).
    """
    # Serialize the caller's payload, not a module-level global.
    payload = {} if data is None else data
    params = json.dumps(payload).encode(encoding='UTF8')
    try:
        req = urllib.request.Request(url, params, headers)
        # The context manager closes the response even if read() fails.
        with urllib.request.urlopen(req) as r:
            return r.read().decode("utf8")
    except urllib.error.HTTPError as e:
        print(e.code)
        print(e.read().decode("utf8"))
    # Reached only after an HTTP error; the one-second pause is kept from the
    # original sample (presumably a simple back-off - confirm).
    time.sleep(1)
    return None


if __name__ == "__main__":
    # Read the local PDF in binary mode and base64-encode it for the request.
    with open('1.pdf', 'rb') as f:
        data = f.read()
    encodestr = str(base64.b64encode(data), 'utf-8')

    url_request = "https://generalpdf.market.alicloudapi.com/ocrservice/pdf"
    request_body = {'fileBase64': encodestr,
                    'fileType': 'word'}

    html = posturl(url_request, data=request_body)
    print(html)

正常返回示例：

{
    // pdf文件总页数
    "totalPageNum": 1,
    //每页结果
    "pageResults": [
        {
            // pdf逻辑页码
            "index": 1,
            "ocrResult": {
                //行信息
                "prism_rowsInfo": [
                    {
                        "word": "MAi a cMRACNAS国际互认",
                        "rowId": 0
                    }
                ],
                // pdf转图片原始宽度
                "orgWidth": 1190,
                // 表格信息
                "prism_tablesInfo": [],
                // 当前页全文内容
                "content": "MAi a cMRACNAS国际互认",
                // 请求唯一id用于排查问题
                "sid": "07e0be3d2bef1962b328c1efe7fcf80b8c323a02ee797292300a97159a9b8f23e7769b77",
                "tableHeadTail": [],
                // 页信息
                "prism_pagesInfo": [
                    {
                        "pageId": 0,
                        "word": "MAi a cMRACNAS国际互认"
                    }
                ],
                "prism_wnum": 3,
                // pdf转图片宽度
                "width": 1190,
                // pdf转图片角度
                "angle": 0,
                // pdf转图片原始高度
                "orgHeight": 1684,
                "prism_version": "1.0.9",
                // pdf转图片字块信息
                "prism_wordsInfo": [
                    {
                        // 字块置信度
                        "prob": 99,
                        // 段落id
                        "paragraphId": 0,
                        // 页id
                        "pageId": 0,
                        "charInfo": [
                            {
                                // 单字置信度
                                "prob": 99,
                                "w": 61,
                                "h": 36,
                                "x": 178,
                                "y": 148,
                                // 单字内容
                                "word": "M"
                            },
                            {
                                "prob": 99,
                                "w": 49,
                                "h": 36,
                                "x": 246,
                                "y": 148,
                                "word": "A"
                            }
                        ],
                        // 行id
                        "rowId": 0,
                        // 字块坐标
                        "pos": [
                            {
                                "x": 178,
                                "y": 148
                            },
                            {
                                "x": 302,
                                "y": 148
                            },
                            {
                                "x": 302,
                                "y": 187
                            },
                            {
                                "x": 178,
                                "y": 187
                            }
                        ],
                        "width": 38,
                        "x": 221,
                        "angle": -90,
                        "y": 105,
                        "word": "MA",
                        "direction": 0,
                        "height": 125
                    }
                ],
                // 段落信息
                "prism_paragraphsInfo": [
                    {
                        "paragraphId": 0,
                        "word": "MA i a cMRA CNAS 国际互认"
                    }
                ],
                // pdf转图片高度
                "height": 1684
            }
        }
    ],
    // 导出word文件base64内容
    "fileBase64": ""
}

失败返回示例：

{
  "error_code": 400,
  "error_msg": "img和url参数不能同时存在"
}

错误码定义：

错误码	错误信息	描述
400	参数错误	具体错误请参考返回的error_msg
401	您无该功能的权限，请开通后使用	您无该功能的权限，请开通后使用
403	购买的容量已用完或者签名错误	购买的容量已用完或者签名错误
500	服务器错误，请稍后重试	服务器错误，请稍后重试