面向文摘的中药方剂与疾病关系抽取研究
Python 代码:使用句法解析提取特征,构建 SVM 模型预测关系
import nltk
import numpy as np
from sklearn import svm
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


def extract_features(sentence):
    """Parse *sentence* with an NLTK feature grammar and extract features.

    Each non-S subtree contributes one feature: the subtree's leaf span
    (joined words) mapped to its syntactic label. Only the first parse
    tree is used.

    Returns a dict {leaf_words: label}. Returns an empty dict when the
    grammar yields no parse — the original implicitly returned None in
    that case, which crashed DictVectorizer downstream.

    NOTE(review): feat0.fcfg is a toy book grammar; real abstracts will
    need a grammar (or dependency parser) covering the target domain.
    """
    parser = nltk.ChartParser(nltk.data.load('grammars/book_grammars/feat0.fcfg'))
    features = {}
    for tree in parser.parse(sentence.split()):
        for subtree in tree.subtrees():
            # Skip the root sentence node; keep phrase-level constituents.
            if subtree.label() == 'S':
                continue
            features[' '.join(subtree.leaves())] = subtree.label()
        break  # keep original behavior: first parse only
    return features


def load_data():
    """Load training sentences and their relation labels.

    Placeholder example data; replace with the real corpus loader.
    """
    sentences = ['John loves Mary', 'Mary hates John']
    labels = ['loves', 'hates']
    return sentences, labels


def build_features_and_labels(sentences, labels):
    """Turn parallel (sentence, label) lists into feature dicts + labels."""
    features = [extract_features(sentence) for sentence in sentences]
    target_labels = list(labels)
    return features, target_labels


def main():
    """Train a linear SVM on parse-derived features and report metrics."""
    sentences, labels = load_data()
    features, target_labels = build_features_and_labels(sentences, labels)

    # Vectorize the string-keyed feature dicts into a dense matrix.
    vectorizer = DictVectorizer(sparse=False)
    X = vectorizer.fit_transform(features)

    X_train, X_test, y_train, y_test = train_test_split(
        X, target_labels, test_size=0.2, random_state=42)

    clf = svm.SVC(kernel='linear')
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    print(classification_report(y_test, y_pred))


if __name__ == '__main__':
    main()
Python 代码:使用 Word2Vec 训练词向量,构建 LSTM 分类模型
LSTM 是 RNN 的一种变体,旨在解决传统 RNN 面临的长期依赖问题。长期依赖是指在处理长序列时,传统 RNN 很难有效地捕捉到很早之前的信息。LSTM 通过引入门控机制(输入门、遗忘门和输出门)来控制信息的流动,从而有效地记忆和利用先前的信息。这使得 LSTM 在处理长序列数据时更加强大,并被广泛应用于机器翻译、语音识别和文本生成等任务。
import gensim
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense


def train_word2vec(sentences, embedding_dim):
    """Train Word2Vec on tokenized *sentences*.

    Returns (model, word_vectors): the fitted gensim model and its
    KeyedVectors, so callers can look up vectors by word. The original
    returned a matrix ordered by Word2Vec's internal vocab, which did
    NOT match the Keras tokenizer indices fed to the Embedding layer —
    the embeddings were silently misaligned.

    Uses the gensim 4 API: ``size=`` became ``vector_size=`` and
    ``wv.vocab`` was removed.
    """
    model = gensim.models.Word2Vec(sentences, vector_size=embedding_dim, min_count=1)
    return model, model.wv


def build_embedding_matrix(word_index, word_vectors, embedding_dim):
    """Build an embedding matrix aligned with Keras tokenizer indices.

    Row 0 stays zero for the padding index; words unknown to Word2Vec
    also get zero rows. The Keras tokenizer lowercases, while Word2Vec
    saw the original casing, so both forms are tried.
    """
    matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for word, idx in word_index.items():
        for candidate in (word, word.capitalize(), word.upper()):
            if candidate in word_vectors:
                matrix[idx] = word_vectors[candidate]
                break
    return matrix


def build_lstm_model(embedding_dim, hidden_units, embedding_matrix=None,
                     max_sequence_length=None):
    """Build an LSTM binary classifier over frozen pretrained embeddings.

    The original referenced module globals ``embeddings`` and
    ``max_sequence_length`` that were never defined at call time
    (NameError); they are now explicit parameters, kept optional so the
    original two-argument call signature still exists.
    """
    model = Sequential()
    model.add(Embedding(embedding_matrix.shape[0], embedding_dim,
                        input_length=max_sequence_length,
                        weights=[embedding_matrix],
                        trainable=False))  # keep Word2Vec vectors frozen
    model.add(LSTM(hidden_units))
    model.add(Dense(1, activation='sigmoid'))
    return model


def main():
    """Train Word2Vec embeddings, then an LSTM sentiment classifier."""
    sentences = [
        ['I', 'love', 'this', 'movie'],
        ['This', 'is', 'great'],
        ['The', 'plot', 'is', 'boring'],
    ]
    labels = np.array([1, 1, 0])  # Keras expects an array, not a list

    embedding_dim = 100
    hidden_units = 64

    _, word_vectors = train_word2vec(sentences, embedding_dim)

    # Tokenize FIRST so sequence length and vocab indices exist before
    # the model is built (the original built the model too early).
    tokenizer = tf.keras.preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts(sentences)
    sequences = tokenizer.texts_to_sequences(sentences)
    max_sequence_length = max(len(sequence) for sequence in sequences)
    padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(
        sequences, maxlen=max_sequence_length)

    embedding_matrix = build_embedding_matrix(
        tokenizer.word_index, word_vectors, embedding_dim)

    model = build_lstm_model(embedding_dim, hidden_units,
                             embedding_matrix=embedding_matrix,
                             max_sequence_length=max_sequence_length)
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['accuracy'])

    model.fit(padded_sequences, labels, epochs=10, batch_size=1)

    test_sentences = [
        ['This', 'is', 'amazing'],
        ['I', 'dislike', 'it'],
    ]
    test_sequences = tokenizer.texts_to_sequences(test_sentences)
    padded_test_sequences = tf.keras.preprocessing.sequence.pad_sequences(
        test_sequences, maxlen=max_sequence_length)

    # Sequential.predict_classes was removed in TF 2.6; threshold the
    # sigmoid output instead.
    predictions = (model.predict(padded_test_sequences) > 0.5).astype(int)

    for sentence, prediction in zip(test_sentences, predictions):
        print(sentence, '->', prediction[0])


if __name__ == '__main__':
    main()