Overview
Basic idea: the first approach extracted keywords from each paper and scored paper pairs by the Jaccard similarity of their keyword sets within a subspace. The keywords were obtained by running several extraction algorithms to produce candidates and then merging the results. Since most of these methods capture little semantics, the plan shifted to training subspace sentence vectors with BERT and scoring pairs with cosine (or another vector) similarity instead of Jaccard.
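As a concrete illustration, here is a minimal sketch of both similarity measures; the keyword sets in the example are hypothetical, and the sentence vectors are assumed to come from whatever BERT encoder is trained later.

import numpy as np

def jaccard(kw1, kw2):
    # Jaccard similarity of two keyword sets
    if not kw1 and not kw2:
        return 0.0
    return len(kw1 & kw2) / len(kw1 | kw2)

def cosine(v1, v2):
    # cosine similarity of two sentence vectors (e.g. from BERT)
    return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))

# hypothetical keyword sets extracted from two papers
print(jaccard({'lstm', 'attention', 'similarity'}, {'bert', 'attention', 'similarity'}))  # 0.5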
1. LSTM for text similarity

from keras.layers import (Input, Embedding, LSTM, Dense, Dropout,
                          BatchNormalization, concatenate)
from keras.models import Model

def get_model(nb_words, EMBEDDING_DIM, embedding_matrix, MAX_SEQUENCE_LENGTH,
              num_lstm, rate_drop_lstm, rate_drop_dense, num_dense, act):
    sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    # embedding: shared, frozen pretrained word vectors
    embedding_layer = Embedding(nb_words,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)
    embedded_sequences_1 = embedding_layer(sequence_1_input)
    embedded_sequences_2 = embedding_layer(sequence_2_input)
    # lstm: one shared encoder for both sequences (Siamese weights)
    lstm_layer = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)
    x1 = lstm_layer(embedded_sequences_1)
    y1 = lstm_layer(embedded_sequences_2)
    # classifier on the concatenated sentence vectors
    merged = concatenate([x1, y1])
    merged = Dropout(rate_drop_dense)(merged)
    merged = BatchNormalization()(merged)
    merged = Dense(num_dense, activation=act)(merged)
    merged = Dropout(rate_drop_dense)(merged)
    merged = BatchNormalization()(merged)
    preds = Dense(1, activation='sigmoid')(merged)
    model = Model(inputs=[sequence_1_input, sequence_2_input],
                  outputs=preds)
    model.compile(loss='binary_crossentropy',
                  optimizer='nadam',
                  metrics=['acc'])
    model.summary()
    return model
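A usage sketch for the model factory above; every hyperparameter value and the random embedding matrix are illustrative stand-ins, not the values used in the experiments.

import numpy as np

nb_words, EMBEDDING_DIM, MAX_SEQUENCE_LENGTH = 20000, 300, 30
embedding_matrix = np.random.normal(size=(nb_words, EMBEDDING_DIM))  # stand-in for pretrained vectors

model = get_model(nb_words, EMBEDDING_DIM, embedding_matrix, MAX_SEQUENCE_LENGTH,
                  num_lstm=175, rate_drop_lstm=0.15, rate_drop_dense=0.15,
                  num_dense=100, act='relu')
# data_1 / data_2: integer-encoded, padded sequences; labels: 0/1 pair similarity
# model.fit([data_1, data_2], labels, epochs=5, batch_size=256, validation_split=0.1)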
2. BiLSTM for text similarity

# same imports as the LSTM example above, plus the Bidirectional wrapper:
from keras.layers import Bidirectional

def get_model(nb_words, EMBEDDING_DIM, embedding_matrix, MAX_SEQUENCE_LENGTH,
              num_lstm, rate_drop_lstm, rate_drop_dense, num_dense, act):
    # shared, frozen pretrained embedding
    embedding_layer = Embedding(nb_words,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)
    # shared bidirectional LSTM encoder (output dimension is 2 * num_lstm)
    lstm_layer = Bidirectional(LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm))
    sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences_1 = embedding_layer(sequence_1_input)
    x1 = lstm_layer(embedded_sequences_1)
    sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences_2 = embedding_layer(sequence_2_input)
    y1 = lstm_layer(embedded_sequences_2)
    # classifier on the concatenated sentence vectors
    merged = concatenate([x1, y1])
    merged = Dropout(rate_drop_dense)(merged)
    merged = BatchNormalization()(merged)
    merged = Dense(num_dense, activation=act)(merged)
    merged = Dropout(rate_drop_dense)(merged)
    merged = BatchNormalization()(merged)
    preds = Dense(1, activation='sigmoid')(merged)
    model = Model(inputs=[sequence_1_input, sequence_2_input],
                  outputs=preds)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])
    model.summary()
    return model
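Both Siamese models consume integer-encoded, fixed-length sequences plus a pretrained embedding matrix. A minimal preparation sketch (the texts are placeholders, and the random matrix stands in for pretrained word vectors):

import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

texts_1 = ['deep learning for paper similarity', 'graph embeddings of citations']
texts_2 = ['paper similarity with neural nets', 'citation graph representations']

tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(texts_1 + texts_2)
data_1 = pad_sequences(tokenizer.texts_to_sequences(texts_1), maxlen=30)
data_2 = pad_sequences(tokenizer.texts_to_sequences(texts_2), maxlen=30)

nb_words = min(20000, len(tokenizer.word_index)) + 1
embedding_matrix = np.random.normal(size=(nb_words, 300))
# in practice each row i would be filled with the pretrained vector of word i,
# e.g. embedding_matrix[i] = w2v[word] for words found in the vector file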
3. ESIM for text similarity

from keras import backend as K
from keras.layers import (Input, Embedding, LSTM, Bidirectional, Dense, Dropout,
                          Concatenate, Multiply, Subtract, Maximum, Lambda)
from keras.models import Model
from keras.optimizers import Adam
import Attention  # project-local module with a custom attention-pooling layer (a sketch follows the code)

def get_model(embedding_matrix, nb_words, EMBEDDING_DIM, MAX_SEQUENCE_LENGTH, num_lstm, rate_drop_dense):
    att1_layer = Attention.Attention(MAX_SEQUENCE_LENGTH)
    sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')  # integer-encoded word features of question 1
    sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')  # integer-encoded word features of question 2
    # embedding
    embedding_layer = Embedding(nb_words,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)
    embedded_sequences_1 = embedding_layer(sequence_1_input)
    embedded_sequences_2 = embedding_layer(sequence_2_input)
    # encode with a shared BiLSTM
    lstm1_layer = Bidirectional(LSTM(num_lstm))
    encode_sequences_1 = lstm1_layer(embedded_sequences_1)
    encode_sequences_2 = lstm1_layer(embedded_sequences_2)
    # stacked unidirectional LSTMs give a second pair of sentence vectors
    lstm0_layer = LSTM(num_lstm, return_sequences=True)
    lstm2_layer = LSTM(num_lstm)
    v1ls = lstm2_layer(lstm0_layer(embedded_sequences_1))
    v2ls = lstm2_layer(lstm0_layer(embedded_sequences_2))
    # attention-pooled vector concatenated with the BiLSTM encoding
    v1 = Concatenate(axis=1)([att1_layer(embedded_sequences_1), encode_sequences_1])
    v2 = Concatenate(axis=1)([att1_layer(embedded_sequences_2), encode_sequences_2])
    # optional character-level branch (disabled):
    # sequence_1c_input = Input(shape=(MAX_SEQUENCE_LENGTH_CHAR,), dtype='int32')  # char features of question 1
    # sequence_2c_input = Input(shape=(MAX_SEQUENCE_LENGTH_CHAR,), dtype='int32')  # char features of question 2
    # embedding_char_layer = Embedding(char_words,
    #                                  EMBEDDING_DIM)
    # embedded_sequences_1c = embedding_char_layer(sequence_1c_input)
    # embedded_sequences_2c = embedding_char_layer(sequence_2c_input)
    # x1c = lstm1_layer(embedded_sequences_1c)
    # x2c = lstm1_layer(embedded_sequences_2c)
    # v1c = Concatenate(axis=1)([att1_layer(embedded_sequences_1c), x1c])
    # v2c = Concatenate(axis=1)([att1_layer(embedded_sequences_2c), x2c])
    # compose interaction features: product, absolute difference, maximum of squares
    mul = Multiply()([v1, v2])
    sub = Lambda(lambda x: K.abs(x))(Subtract()([v1, v2]))
    maximum = Maximum()([Multiply()([v1, v1]), Multiply()([v2, v2])])
    # mulc = Multiply()([v1c, v2c])
    # subc = Lambda(lambda x: K.abs(x))(Subtract()([v1c, v2c]))
    # maximumc = Maximum()([Multiply()([v1c, v1c]), Multiply()([v2c, v2c])])
    sub2 = Lambda(lambda x: K.abs(x))(Subtract()([v1ls, v2ls]))
    # matchlist = Concatenate(axis=1)([mul, sub, mulc, subc, maximum, maximumc, sub2])
    matchlist = Concatenate(axis=1)([mul, sub, maximum, sub2])
    matchlist = Dropout(rate_drop_dense)(matchlist)
    matchlist = Concatenate(axis=1)(
        [Dense(32, activation='relu')(matchlist), Dense(48, activation='sigmoid')(matchlist)])
    res = Dense(1, activation='sigmoid')(matchlist)
    # model = Model(inputs=[sequence_1_input, sequence_2_input,
    #                       sequence_1c_input, sequence_2c_input], outputs=res)
    model = Model(inputs=[sequence_1_input, sequence_2_input], outputs=res)
    model.compile(optimizer=Adam(lr=0.001), loss="binary_crossentropy", metrics=['acc'])
    model.summary()
    return model
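The att1_layer above comes from a project-local Attention module that is not included in this excerpt. A minimal attention-pooling layer with a compatible constructor, sketched as an assumption rather than the author's actual implementation:

from keras import backend as K
from keras.layers import Layer

class Attention(Layer):
    # attention pooling: a learned, softmax-normalised weighted sum over timesteps
    def __init__(self, step_dim, **kwargs):
        self.step_dim = step_dim  # kept only for signature compatibility with Attention(MAX_SEQUENCE_LENGTH)
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        # one scoring weight per feature dimension
        self.W = self.add_weight(name='att_W', shape=(input_shape[-1],),
                                 initializer='glorot_uniform', trainable=True)
        super(Attention, self).build(input_shape)

    def call(self, x):
        # x: (batch, steps, features) -> attention scores (batch, steps, 1)
        e = K.tanh(K.dot(x, K.expand_dims(self.W)))
        a = K.exp(e - K.max(e, axis=1, keepdims=True))
        a = a / K.sum(a, axis=1, keepdims=True)
        return K.sum(x * a, axis=1)  # weighted sum -> (batch, features)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])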
4. Decomposable Attention for text similarity

from keras.layers import (Input, Dense, Dropout, BatchNormalization, Concatenate,
                          GlobalAvgPool1D, GlobalMaxPool1D)
from keras.models import Model
from keras.optimizers import Adam
# create_pretrained_embedding, time_distributed, soft_attention_alignment,
# submult and apply_multiple are project helper functions (sketched after the code).

def get_model(embedding_matrix_file, MAX_SEQUENCE_LENGTH,
              rate_drop_projection, num_projection, hidden_projection,
              rate_drop_compare, num_compare,
              rate_drop_dense, num_dense):
    sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    # embedding
    embedding_layer = create_pretrained_embedding(embedding_matrix_file, mask_zero=False)
    embedded_sequences_1 = embedding_layer(sequence_1_input)
    embedded_sequences_2 = embedding_layer(sequence_2_input)
    # projection: per-timestep dense layers shared between both sequences
    projection_layers = []
    if hidden_projection > 0:
        projection_layers.extend([
            Dense(hidden_projection, activation='elu'),
            Dropout(rate=rate_drop_projection),
        ])
    projection_layers.extend([
        Dense(num_projection, activation=None),
        Dropout(rate=rate_drop_projection),
    ])
    encode_sequences_1 = time_distributed(embedded_sequences_1, projection_layers)
    encode_sequences_2 = time_distributed(embedded_sequences_2, projection_layers)
    # attention: soft-align each sequence against the other
    aligned_sequences_1, aligned_sequences_2 = soft_attention_alignment(encode_sequences_1, encode_sequences_2)
    # compare each token with its aligned counterpart
    combined_sequences_1 = Concatenate()(
        [encode_sequences_1, aligned_sequences_2, submult(encode_sequences_1, aligned_sequences_2)])
    combined_sequences_2 = Concatenate()(
        [encode_sequences_2, aligned_sequences_1, submult(encode_sequences_2, aligned_sequences_1)])
    compare_layers = [
        Dense(num_compare, activation='elu'),
        Dropout(rate_drop_compare),
        Dense(num_compare, activation='elu'),
        Dropout(rate_drop_compare),
    ]
    compare_sequences_1 = time_distributed(combined_sequences_1, compare_layers)
    compare_sequences_2 = time_distributed(combined_sequences_2, compare_layers)
    # aggregate: average- and max-pool over timesteps
    rep_sequences_1 = apply_multiple(compare_sequences_1, [GlobalAvgPool1D(), GlobalMaxPool1D()])
    rep_sequences_2 = apply_multiple(compare_sequences_2, [GlobalAvgPool1D(), GlobalMaxPool1D()])
    # classifier
    merged = Concatenate()([rep_sequences_1, rep_sequences_2])
    dense = BatchNormalization()(merged)
    dense = Dense(num_dense, activation='elu')(dense)
    dense = Dropout(rate_drop_dense)(dense)
    dense = BatchNormalization()(dense)
    dense = Dense(num_dense, activation='elu')(dense)
    dense = Dropout(rate_drop_dense)(dense)
    out_ = Dense(1, activation='sigmoid')(dense)
    model = Model(inputs=[sequence_1_input, sequence_2_input], outputs=out_)
    model.compile(optimizer=Adam(lr=1e-3), loss='binary_crossentropy', metrics=['binary_crossentropy', 'accuracy'])
    return model
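The helper functions used above (create_pretrained_embedding, time_distributed, soft_attention_alignment, submult, apply_multiple) are not defined in this excerpt. The sketches below follow the widely circulated Quora question-pairs Kaggle kernels and are assumptions about the author's versions:

import numpy as np
from keras.activations import softmax
from keras.layers import (Embedding, TimeDistributed, Dot, Permute, Lambda,
                          Multiply, Subtract, Concatenate)

def create_pretrained_embedding(pretrained_weights_file, trainable=False, **kwargs):
    # frozen Embedding layer initialised from a saved weight matrix (.npy assumed)
    pretrained_weights = np.load(pretrained_weights_file)
    in_dim, out_dim = pretrained_weights.shape
    return Embedding(in_dim, out_dim, weights=[pretrained_weights],
                     trainable=trainable, **kwargs)

def unchanged_shape(input_shape):
    return input_shape

def time_distributed(x, layers):
    # apply a stack of layers to every timestep
    for layer in layers:
        x = TimeDistributed(layer)(x)
    return x

def soft_attention_alignment(input_1, input_2):
    # soft-align each sequence against the other (decomposable attention)
    attention = Dot(axes=-1)([input_1, input_2])  # (batch, len1, len2)
    w_att_1 = Lambda(lambda x: softmax(x, axis=1),
                     output_shape=unchanged_shape)(attention)
    w_att_2 = Permute((2, 1))(Lambda(lambda x: softmax(x, axis=2),
                                     output_shape=unchanged_shape)(attention))
    in1_aligned = Dot(axes=1)([w_att_1, input_1])  # input_1 re-expressed over len2 steps
    in2_aligned = Dot(axes=1)([w_att_2, input_2])  # input_2 re-expressed over len1 steps
    return in1_aligned, in2_aligned

def submult(input_1, input_2):
    # element-wise difference and product, concatenated
    mult = Multiply()([input_1, input_2])
    sub = Subtract()([input_1, input_2])
    return Concatenate()([sub, mult])

def apply_multiple(input_, layers):
    # apply several pooling layers and concatenate the results
    return Concatenate()([layer(input_) for layer in layers])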