Scenario: English handwriting
ocr_generated.py
import numpy as np
import os
import glob
from PIL import Image
# ord(): converts a character to its integer code point
# chr(): converts an integer code point back to a character
# charactersNo dict: maps a~z, A~Z, 0~9 and the punctuation ".,?\'-:;!/\"<>&()+" to the indices 0-25, 26-51, 52-61, 62, ...
# characters list: stores the keys of charactersNo in index order
# charactersNo exists so that the sentences in the txt files paired with the handwriting images can be encoded as integers, which is convenient for storage and for computing edit distances
charactersNo = {}
characters = []
length = []
for i in range(26):
    charactersNo[chr(ord('a') + i)] = i
    characters.append(chr(ord('a') + i))  # append the character at the end of the list
for i in range(26):
    charactersNo[chr(ord('A') + i)] = i + 26
    characters.append(chr(ord('A') + i))
for i in range(10):
    charactersNo[chr(ord('0') + i)] = i + 52
    characters.append(chr(ord('0') + i))
punctuations = ".,?\'-:;!/\"<>&()+"
for p in punctuations:
    charactersNo[p] = len(charactersNo)  # current number of entries in the dict
    characters.append(p)
# Read the paths of all files under the train_img and train_txt folders.
# The code below produces:
#   Imgs:   list  - the handwritten English images
#   Y:      array - each image's sentence from its txt file, stored as encoded character indices
#   length: array - the number of characters in each image's sentence
def get_data():
    # os.path.join(path1, path2, ...) joins path components;
    # glob.glob(pattern) returns the list of matching file paths
    imgfiles = glob.glob(os.path.join("train_img", "*"))
    imgfiles.sort()
    txtfiles = glob.glob(os.path.join("train_txt", "*"))
    txtfiles.sort()
    Imgs = []
    Y = []
    length = []
    for i in range(len(imgfiles)):
        fin = open(txtfiles[i])
        line = fin.readlines()  # readlines() reads the whole file into a list, one element per line
        line = line[0]
        fin.close()
        # np.asarray: a list input is copied into a new array (later changes to the
        # list do not affect it); an ndarray input is returned as-is and tracks changes
        y = np.asarray([0] * len(line))
        succ = True
        for j in range(len(line)):
            if line[j] not in charactersNo:
                succ = False
                break
            y[j] = charactersNo[line[j]]
        if not succ:  # skip samples containing characters outside the table
            continue
        Y.append(y)
        length.append(len(line))
        im = Image.open(imgfiles[i])
        width, height = im.size
        im = im.convert("L")  # convert to 8-bit grayscale
        Imgs.append(im)
    print("train:", len(Imgs), len(Y))
    Y = np.asarray(Y)
    length = np.asarray(length)
    return Imgs, Y
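
As a quick sanity check, the character table can be exercised on its own. The following is a minimal sketch (the sample string is made up) that encodes a line of text into indices and decodes it back; characters outside the table, such as the space, are dropped here, whereas get_data skips any line that contains them:

text = "Hello, world"  # hypothetical sample label
codes = [charactersNo[c] for c in text if c in charactersNo]
decoded = "".join(characters[i] for i in codes)
print(codes)    # [33, 4, 11, 11, 14, 63, 22, 14, 17, 11, 3]
print(decoded)  # "Hello,world"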
ocr_forward.py
import os
import glob
import random
import numpy as np
import tensorflow as tf
from PIL import Image
from PIL import ImageFile
import ocr_generated
conv1_filter = 32
conv2_filter = 64
conv3_filter = 128
conv4_filter = 256
# Initialize the weights, with L2 regularization on w to curb overfitting
def get_weight(shape, regularizer):
    w = tf.Variable(tf.truncated_normal(shape, stddev=0.1, dtype=tf.float32))
    if regularizer is not None:
        tf.add_to_collection('losses', tf.contrib.layers.l2_regularizer(regularizer)(w))
    return w
# Initialize the biases to zero
def get_bias(shape):
    b = tf.Variable(tf.constant(0., shape=shape, dtype=tf.float32))
    return b
# Convolution wrapper around tf.nn.conv2d (stride 1, SAME padding)
def conv2d(x, w):
    return tf.nn.conv2d(x, w, strides=[1, 1, 1, 1], padding='SAME')
# Pooling wrapper; max pooling keeps the strongest feature responses
def max_pool_2x2(x, kernel_size):
    return tf.nn.max_pool(x, ksize=kernel_size, strides=kernel_size, padding='VALID')
# The forward pass uses four convolution + pooling blocks
# Block 1: convolution + pooling
def forward(x, train, regularizer):
    conv1_w = get_weight([3, 3, 1, conv1_filter], regularizer)
    conv1_b = get_bias([conv1_filter])
    conv1 = conv2d(x, conv1_w)
    relu1 = tf.nn.relu(tf.nn.bias_add(conv1, conv1_b))
    pool1 = max_pool_2x2(relu1, [1, 2, 2, 1])
    # keep_prob controls how aggressively dropout thins out activations
    if train:
        keep_prob = 0.6  # drop activations during training to reduce overfitting
    else:
        keep_prob = 1.0  # no dropout at inference time
    # Block 2: convolution + pooling
    conv2_w = get_weight([5, 5, conv1_filter, conv2_filter], regularizer)
    conv2_b = get_bias([conv2_filter])
    conv2 = conv2d(tf.nn.dropout(pool1, keep_prob), conv2_w)
    relu2 = tf.nn.relu(tf.nn.bias_add(conv2, conv2_b))
    pool2 = max_pool_2x2(relu2, [1, 2, 2, 1])
    # Block 3: convolution + pooling
    conv3_w = get_weight([5, 5, conv2_filter, conv3_filter], regularizer)
    conv3_b = get_bias([conv3_filter])
    conv3 = conv2d(tf.nn.dropout(pool2, keep_prob), conv3_w)
    relu3 = tf.nn.relu(tf.nn.bias_add(conv3, conv3_b))
    pool3 = max_pool_2x2(relu3, [1, 2, 2, 1])
    # Block 4: convolution + pooling
    conv4_w = get_weight([5, 5, conv3_filter, conv4_filter], regularizer)
    conv4_b = get_bias([conv4_filter])
    conv4 = conv2d(tf.nn.dropout(pool3, keep_prob), conv4_w)
    relu4 = tf.nn.relu(tf.nn.bias_add(conv4, conv4_b))
    pool4 = max_pool_2x2(relu4, [1, 7, 1, 1])
    rnn_inputs = tf.reshape(tf.nn.dropout(pool4, keep_prob), [-1, 256, conv4_filter])
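    # Shape check for a 112x1024x1 input: three 2x2 pools and one 7x1 pool leave
    # 112/2/2/2/7 = 2 rows and 1024/2/2/2 = 128 columns, so the reshape yields
    # 2*128 = 256 time steps of conv4_filter (=256) features per image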
    num_hidden = 512
    num_classes = len(ocr_generated.charactersNo) + 1
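    # the +1 adds the "blank" label that CTC requires on top of the character classes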
    W = tf.Variable(tf.truncated_normal([num_hidden, num_classes], stddev=0.1), name="W")
    b = tf.Variable(tf.constant(0., shape=[num_classes]), name="b")
    # A bidirectional LSTM reads the feature sequence in both directions
    cell_fw = tf.nn.rnn_cell.LSTMCell(num_hidden >> 1, state_is_tuple=True)
    cell_bw = tf.nn.rnn_cell.LSTMCell(num_hidden >> 1, state_is_tuple=True)
    # Feed both cells into the dynamic bidirectional RNN
    outputs_fw_bw, _ = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, rnn_inputs, dtype=tf.float32)
    # tf.concat joins the forward and backward outputs along the feature axis
    outputs1 = tf.concat(outputs_fw_bw, 2)
    shape = tf.shape(x)
    batch_s, max_timesteps = shape[0], shape[1]
    outputs = tf.reshape(outputs1, [-1, num_hidden])
    # Fully connected output layer
    logits0 = tf.matmul(tf.nn.dropout(outputs, keep_prob), W) + b
    logits1 = tf.reshape(logits0, [batch_s, -1, num_classes])
    logits = tf.transpose(logits1, (1, 0, 2))
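    # the transpose makes logits time-major, [max_time, batch, num_classes],
    # which is the default layout tf.nn.ctc_loss expects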
    y = tf.cast(logits, tf.float32)
    return y
ocr_backward.py
import os
import glob
import random
import numpy as np
import tensorflow as tf
from PIL import Image
from PIL import ImageFilter
import ocr_forward
import ocr_generated
REGULARIZER = 0.0001
GraphSize = (112, 1024)
MODEL_SAVE_PATH = "./model/"
MODEL_NAME = "ocr_model"
def transform(im, flag=True): ...
def create_sparse(Y, dtype=np.int32): ...
def backward():
    x = tf.placeholder(tf.float32, shape=[None, GraphSize[0], GraphSize[1], 1])
    y = ocr_forward.forward(x, True, REGULARIZER)
    global_step = tf.Variable(0, trainable=False)  # global step counter
    seq_len = tf.placeholder(tf.int32, shape=[None])
    y_ = tf.sparse_placeholder(tf.int32)
    Imgs, Y = ocr_generated.get_data()
    # The loss is tf.nn.ctc_loss
    loss = tf.nn.ctc_loss(y_, y, seq_len)
    cost = tf.reduce_mean(loss)
    # The optimizer is Adam: a larger learning rate early on, a smaller one later
    optimizer1 = tf.train.AdamOptimizer(learning_rate=0.003).minimize(cost, global_step=global_step)
    optimizer2 = tf.train.AdamOptimizer(learning_rate=0.001).minimize(cost, global_step=global_step)
    width1_decoded, width1_log_prob = tf.nn.ctc_beam_search_decoder(y, seq_len, merge_repeated=False, beam_width=1)
    decoded, log_prob = tf.nn.ctc_beam_search_decoder(y, seq_len, merge_repeated=False)
    width_acc = tf.reduce_mean(tf.edit_distance(tf.cast(width1_decoded[0], tf.int32), y_))
    acc = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32), y_))
    nBatchArray = np.arange(Y.shape[0])
    epoch = 100
    batchSize = 32
    saver = tf.train.Saver(max_to_keep=1)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    bestDevErr = 100.0
    with sess:
        sess.run(tf.global_variables_initializer())
        ckpt = tf.train.get_checkpoint_state(MODEL_SAVE_PATH)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
        for ep in range(epoch):
            np.random.shuffle(nBatchArray)
            for i in range(0, Y.shape[0], batchSize):
                batch_output = create_sparse(Y[nBatchArray[i:i + batchSize]])
                X = [None] * min(Y.shape[0] - i, batchSize)
                for j in range(len(X)):
                    X[j] = transform(Imgs[nBatchArray[i + j]])
                feed_dict = {x: X, seq_len: np.ones(min(Y.shape[0] - i, batchSize)) * 256, y_: batch_output}
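                # seq_len is 256 for every sample: the forward pass maps each
                # 112x1024 image to exactly 256 RNN time steps (see ocr_forward.py)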
                if ep < 50:
                    sess.run(optimizer1, feed_dict=feed_dict)
                else:
                    sess.run(optimizer2, feed_dict=feed_dict)
                print(ep, i, "loss:", sess.run(cost, feed_dict=feed_dict),
                      "err:", sess.run(width_acc, feed_dict=feed_dict))
                saver.save(sess, os.path.join(MODEL_SAVE_PATH, MODEL_NAME))
def main():
    backward()

if __name__ == '__main__':
    main()
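
The listing elides transform and create_sparse. Because tf.nn.ctc_loss consumes its labels through tf.sparse_placeholder, create_sparse has to pack a batch of variable-length label arrays into the (indices, values, dense_shape) triple of a SparseTensor. A minimal sketch of such a function, a reconstruction rather than the author's exact code, could look like this:

def create_sparse(Y, dtype=np.int32):
    # Collect one (batch index, time index) entry per label character
    indices = []
    values = []
    for n, y in enumerate(Y):
        for t, label in enumerate(y):
            indices.append((n, t))
            values.append(label)
    indices = np.asarray(indices, dtype=np.int64)
    values = np.asarray(values, dtype=dtype)
    # dense_shape is (batch size, longest label length in the batch)
    shape = np.asarray([len(Y), indices[:, 1].max() + 1], dtype=np.int64)
    return indices, values, shape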
ocr_text.py
import os
import glob
import random
import numpy as np
import tensorflow as tf
from PIL import Image
from PIL import ImageFilter
import ocr_forward
REGULARIZER = 0.0001
GraphSize = (112, 1024)
def transform(im, flag=True): ...
def countMargin(v, minSum, direction=True): ...
def splitLine(seg, dataSum, h, maxHeight): ...
def getLine(im, data, upperbound=8, lowerbound=25, threshold=30, h=40, minHeight=35, maxHeight=120, beginX=20, endX=-20): ...
def calEditDistance(text1, text2):
    dp = np.asarray([0] * (len(text1) + 1) * (len(text2) + 1)).reshape(len(text1) + 1, len(text2) + 1)
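
The listing breaks off at this point. For reference, a standard dynamic-programming edit distance consistent with the dp table allocated above can be sketched as follows (a reconstruction of the omitted body, not the author's code):

def calEditDistance(text1, text2):
    # dp[i][j] = edit distance between text1[:i] and text2[:j]
    dp = np.zeros((len(text1) + 1, len(text2) + 1), dtype=np.int32)
    dp[:, 0] = np.arange(len(text1) + 1)  # delete all of text1[:i]
    dp[0, :] = np.arange(len(text2) + 1)  # insert all of text2[:j]
    for i in range(1, len(text1) + 1):
        for j in range(1, len(text2) + 1):
            cost = 0 if text1[i - 1] == text2[j - 1] else 1
            dp[i][j] = min(dp[i - 1][j] + 1,         # deletion
                           dp[i][j - 1] + 1,         # insertion
                           dp[i - 1][j - 1] + cost)  # substitution
    return dp[len(text1)][len(text2)]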