import pandas as pd

# Dataset: https://zenodo.org/record/2783642
q_df = pd.read_csv("Questions.csv")
a_df = pd.read_csv("Answers.csv")

print("q_df.shape:", q_df.shape)
print("a_df.shape:", a_df.shape)

# Normalize header whitespace on both frames before joining on AID.
for frame in (q_df, a_df):
    frame.columns = [name.strip() for name in frame.columns]

# Join questions to answers; the merged frame is (question text, AID, answer text).
df = q_df.merge(a_df, on="AID")
df.columns = ["query", "AID", "document"]
import requests

# Download the Japanese stopword list (stopwords-iso project).
url = "https://raw.githubusercontent.com/stopwords-iso/stopwords-ja/master/stopwords-ja.txt"
_response = requests.get(url)
# Fail loudly on a bad download instead of silently tokenizing an error page.
_response.raise_for_status()
stopwords = _response.text.split("\n")
# Set copy for O(1) membership tests in preprocess(); the list is kept for compatibility.
_stopword_set = set(stopwords)

import MeCab

# Build the tagger once at module level — constructing MeCab.Tagger() per call
# is expensive and extract_nouns_verbs() runs once per corpus document.
_tagger = MeCab.Tagger()


def extract_nouns_verbs(text):
    """Return the non-ASCII noun/verb surface forms MeCab finds in *text*.

    Each line of MeCab's default output is ``surface<TAB>features``; lines whose
    text mentions 名詞 (noun) or 動詞 (verb) are kept, and pure-ASCII surfaces
    (numbers, markup, the EOS marker) are dropped.
    """
    nouns_verbs = []
    for line in _tagger.parse(text).split('\n'):
        if '名詞' in line or '動詞' in line:
            word = line.split('\t')[0]
            if not word.isascii():
                nouns_verbs.append(word)
    return nouns_verbs


def preprocess(text):
    """Tokenize *text* with MeCab and drop Japanese stopwords."""
    return [token for token in extract_nouns_verbs(text) if token not in _stopword_set]


# Tokenize the answer corpus and build the BM25 index over it.
tokenized_texts = [preprocess(text) for text in a_df["Text"]]

from rank_bm25 import BM25Okapi

bm25 = BM25Okapi(tokenized_texts)
def get_query_result_rank(txt, aid, vectorstore):
    """Return the 1-based rank of *aid* among the vector-store hits for *txt*.

    Raises ValueError if *aid* is not within the top DOC_NUM results.
    """
    hits = vectorstore.similarity_search(txt, k=DOC_NUM)
    retrieved_ids = [hit.metadata["AID"] for hit in hits]
    return 1 + retrieved_ids.index(aid)


def get_query_result_rank_bm25(txt, aid, bm25):
    """Return the 1-based rank of *aid* among the BM25 hits for *txt*.

    Raises ValueError if *aid* is not within the top DOC_NUM results.
    """
    query_tokens = preprocess(txt)
    retrieved_ids = bm25.get_top_n(query_tokens, a_df.AID, n=DOC_NUM)
    return 1 + retrieved_ids.index(aid)
from langchain.embeddings import HuggingFaceEmbeddings

# Multilingual sentence-transformer checkpoint (covers Japanese).
model_name = "distiluse-base-multilingual-cased-v2"
embeddings = HuggingFaceEmbeddings(model_name=model_name)
# (notebook cell output omitted)
import pickle

# Persist the embeddings object so later sessions can reload it without
# re-initializing the model wrapper.
serialized = pickle.dumps(embeddings)
with open("embeddings.pkl", "wb") as f:
    f.write(serialized)
from langchain.embeddings import TensorflowHubEmbeddings

# No model URL given, so the wrapper's default TF-Hub model is used
# (presumably the multilingual universal-sentence-encoder — verify).
embedding_tfhub = TensorflowHubEmbeddings()

# Smoke-test the embedder on a Japanese query.
query_result = embedding_tfhub.embed_query("日本語")
Collecting tensorflow-hub
Downloading tensorflow_hub-0.13.0-py2.py3-none-any.whl (100 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100.6/100.6 kB 6.3 MB/s eta 0:00:00
Requirement already satisfied: numpy>=1.12.0 in /Users/dayuan.jiang/opt/anaconda3/envs/notion-db/lib/python3.9/site-packages (from tensorflow-hub) (1.23.5)
Requirement already satisfied: protobuf>=3.19.6 in /Users/dayuan.jiang/opt/anaconda3/envs/notion-db/lib/python3.9/site-packages (from tensorflow-hub) (3.20.3)
Installing collected packages: tensorflow-hub
Successfully installed tensorflow-hub-0.13.0
Note: you may need to restart the kernel to use updated packages.
import torch
import tensorflow_hub
conda install nomkl
Retrieving notices: ...working... done
Collecting package metadata (current_repodata.json): done
Solving environment: done
## Package Plan ##
environment location: /Users/dayuan.jiang/opt/anaconda3/envs/notion-db
added / updated specs:
- nomkl
The following packages will be downloaded:
package | build
---------------------------|-----------------
blas-1.0 | openblas 45 KB
certifi-2023.5.7 | py39hecd8cb5_0 153 KB
nomkl-3.0 | 0 45 KB
------------------------------------------------------------
Total: 243 KB
The following NEW packages will be INSTALLED:
blas pkgs/main/osx-64::blas-1.0-openblas
nomkl pkgs/main/osx-64::nomkl-3.0-0
The following packages will be UPDATED:
certifi 2022.12.7-py39hecd8cb5_0 --> 2023.5.7-py39hecd8cb5_0
openssl 1.1.1s-hca72f7f_0 --> 1.1.1t-hca72f7f_0
Downloading and Extracting Packages
blas-1.0 | 45 KB | | 0%
certifi-2023.5.7 | 153 KB | | 0%
blas-1.0 | 45 KB | ##################################### | 100%
certifi-2023.5.7 | 153 KB | ##################################### | 100%
certifi-2023.5.7 | 153 KB | ##################################### | 100%
nomkl-3.0 | 45 KB | #############1 | 35%
nomkl-3.0 | 45 KB | ##################################### | 100%
Preparing transaction: done
Verifying transaction: done
Executing transaction: done
Note: you may need to restart the kernel to use updated packages.