# https://gpt-index.readthedocs.io/en/latest/guides/tutorials/terms_definitions_tutorial.html#improvement-2-refining-better-prompts
from dotenv import load_dotenv
load_dotenv()
True
import pandas as pd
# https://zenodo.org/record/2783642
q_df = pd.read_csv("Questions.csv")
a_df = pd.read_csv("Answers.csv")
print("q_df.shape:", q_df.shape)
print("a_df.shape:", a_df.shape)
q_df.columns = [c.strip() for c in q_df.columns]
a_df.columns = [c.strip() for c in a_df.columns]
df = q_df.merge(a_df, on="AID")
df.columns = ["query","AID","document"]
q_df.shape: (427, 2)
a_df.shape: (79, 2)
display(q_df.head(1))
display(a_df.head(1))
display(df.head(1))
Text AID
0 履修している授業で先生が資料をアップロードしているはずだが、コース上に資料が見当たらない。 A001
AID Text
0 A001 資料が見つからない場合は、以下の点を確認してください。<br><br><br>【受講生編】<...
query AID document
0 履修している授業で先生が資料をアップロードしているはずだが、コース上に資料が見当たらない。 A001 資料が見つからない場合は、以下の点を確認してください。<br><br><br>【受講生編】<...

Test cases: different vector stores, different embedding methods, different libraries

from langchain.vectorstores import FAISS, Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from tqdm.auto import tqdm
embedding = OpenAIEmbeddings()
# raw_doc_embeddings = embedding.embed_documents(a_df["Text"])
raw_query_embeddings = embedding.embed_documents(q_df["Text"])
metadata = a_df[["AID"]].to_dict(orient="records")
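A quick shape check (a sketch; the 1536 dimensionality assumes the LangChain default OpenAI embedding model at the time, text-embedding-ada-002):

print(len(raw_query_embeddings), len(raw_query_embeddings[0]))  # (number of queries, 1536 assuming ada-002)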
faiss_vectorstore = FAISS.from_texts(
    texts=a_df["Text"].tolist(),
    embedding=embedding,
    metadatas=metadata,
)

chroma_vectorstore = Chroma.from_texts(
    texts=a_df["Text"].tolist(),
    embedding=embedding,
    metadatas=metadata,
)
Using embedded DuckDB without persistence: data will be transient
# (query text, gold answer ID, precomputed query embedding) triples
query_list = list(zip(q_df["Text"], q_df["AID"], raw_query_embeddings))
DOC_NUM = len(a_df)
def get_query_result_rank(txt, aid, vectorstore):
    # rank all documents by similarity and return the 1-based rank of the gold answer (aid)
    search_result = vectorstore.similarity_search(txt, k=DOC_NUM)
    rank_result = [r.metadata["AID"] for r in search_result]
    return rank_result.index(aid) + 1

def mrr(df):
    # Mean Reciprocal Rank: average of 1/rank over all queries
    return (1 / df["rank"]).mean()

def recall(df, k):
    # Recall@k: fraction of queries whose gold answer is ranked within the top k
    return (df["rank"] <= k).mean()

def evaluate(query_list, vectorstore, get_rank_func=get_query_result_rank):
    result_list = []
    for query, aid, query_embedding in tqdm(query_list):
        # query_embedding is precomputed but not used by the rank functions
        rank = get_rank_func(query, aid, vectorstore)
        result_list.append((query, rank))
    result_df = pd.DataFrame(result_list, columns=["query", "rank"])
    return result_df, mrr(result_df), recall(result_df, 1), recall(result_df, 5)
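To make the metrics concrete, a tiny hand-checkable example (toy data, not part of the evaluation): with ranks 1, 3 and 6, MRR is (1 + 1/3 + 1/6) / 3 = 0.5, Recall@1 is 1/3 and Recall@5 is 2/3.

toy_df = pd.DataFrame({"query": ["q1", "q2", "q3"], "rank": [1, 3, 6]})
print(mrr(toy_df))        # (1 + 1/3 + 1/6) / 3 = 0.5
print(recall(toy_df, 1))  # 1/3
print(recall(toy_df, 5))  # 2/3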


faiss_result = evaluate(query_list, faiss_vectorstore)
chroma_result = evaluate(query_list, chroma_vectorstore)
# each result is (result_df, MRR, Recall@1, Recall@5); print only the metrics
print(faiss_result[1:])
print(chroma_result[1:])
(0.6853719607667331, 0.550351288056206, 0.8665105386416861)
(0.6853569484566387, 0.550351288056206, 0.8665105386416861)
from llama_index import GPTVectorStoreIndex, Document
documents = [Document(text=row["Text"], extra_info={"AID": row["AID"]}) for _, row in a_df.iterrows()]
vec_store = GPTVectorStoreIndex.from_documents(documents)
query_engine = vec_store.as_retriever(similarity_top_k=DOC_NUM)
def get_query_result_rank_lmidx(text, aid, vectorstore):
    search_result = vectorstore.retrieve(text)
    rank_result = [r.node.extra_info["AID"] for r in search_result]
    return rank_result.index(aid) + 1
lmidx_result = evaluate(query_list, query_engine, get_query_result_rank_lmidx)
print(faiss_result[1:])
print(chroma_result[1:])
print(lmidx_result[1:])
(0.6853719607667331, 0.550351288056206, 0.8665105386416861)
(0.6853569484566387, 0.550351288056206, 0.8665105386416861)
(0.6862287549832872, 0.550351288056206, 0.8501170960187353)
from langchain import ElasticVectorSearch
from langchain.embeddings import OpenAIEmbeddings

embedding = OpenAIEmbeddings()
# connect to a local Elasticsearch instance (the index used for evaluation is created below with from_texts)
elastic_vector_search = ElasticVectorSearch(
    elasticsearch_url="http://localhost:9200",
    index_name="test_index",
    embedding=embedding,
)
elastic_vectorstore = ElasticVectorSearch.from_texts(
    texts=a_df["Text"].tolist(),
    embedding=embedding,
    metadatas=metadata,
    elasticsearch_url="http://localhost:9200",
)
elastic_result = evaluate(query_list, elastic_vectorstore, get_query_result_rank)
print(faiss_result[1:])
print(chroma_result[1:])
print(lmidx_result[1:])
print(elastic_result[1:])
(0.6853719607667331, 0.550351288056206, 0.8665105386416861)
(0.6853569484566387, 0.550351288056206, 0.8665105386416861)
(0.6862287549832872, 0.550351288056206, 0.8501170960187353)
(0.6853931924624378, 0.550351288056206, 0.8688524590163934)
import requests
# download Japanese stopwords (stopwords-iso)
url = "https://raw.githubusercontent.com/stopwords-iso/stopwords-ja/master/stopwords-ja.txt"
stopwords = requests.get(url).text.split("\n")

import MeCab
# parser = MeCab.Tagger("-Owakati")
def extract_nouns_verbs(text):
    # parse with MeCab and keep surface forms whose feature string contains 名詞 (noun)
    # or 動詞 (verb); note the substring check also matches 助動詞 (auxiliary verbs)
    parser = MeCab.Tagger()
    parsed_text = parser.parse(text)
    lines = parsed_text.split('\n')
    nouns_verbs = []

    for line in lines:
        if '名詞' in line or '動詞' in line:
            parts = line.split('\t')
            word = parts[0]
            # drop pure-ASCII tokens (URLs, English fragments, etc.)
            if not word.isascii():
                nouns_verbs.append(word)
    return nouns_verbs

def preprocess(text):
    # tokenize, then drop Japanese stopwords
    return [token for token in extract_nouns_verbs(text) if token not in stopwords]

tokenized_texts = [preprocess(text) for text in a_df["Text"]] 

from rank_bm25 import BM25Okapi
bm25 = BM25Okapi(tokenized_texts)
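As a quick sanity check (illustrative only; the exact tokens and ranking depend on the installed MeCab dictionary and the data, so no output is shown), the tokenizer and BM25 ranking can be inspected for a single query:

sample_query = q_df["Text"].iloc[0]
print(preprocess(sample_query))                                 # nouns/verbs left after stopword removal
print(bm25.get_top_n(preprocess(sample_query), a_df.AID, n=3))  # three highest-scoring answer IDs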
# for query, aid, embedding in query_list:
#     tokenized_query = parser.parse(query).strip().split()
#     rank_result = bm25.get_top_n(tokenized_query,a_df.AID , n=DOC_NUM)
    

def get_query_result_rank_bm25(txt, aid, bm25):
    # rank all answers by BM25 score and return the 1-based rank of the gold answer
    tokenized_query = preprocess(txt)
    rank_result = bm25.get_top_n(tokenized_query, a_df.AID, n=DOC_NUM)
    return rank_result.index(aid) + 1
bm25_result = evaluate(query_list, bm25, get_query_result_rank_bm25)
print(faiss_result[1:])
print(chroma_result[1:])
print(lmidx_result[1:])
print(elastic_result[1:])
print(bm25_result[1:])
(0.6853719607667331, 0.550351288056206, 0.8665105386416861)
(0.6853569484566387, 0.550351288056206, 0.8665105386416861)
(0.6862287549832872, 0.550351288056206, 0.8501170960187353)
(0.6853931924624378, 0.550351288056206, 0.8688524590163934)
(0.6018858908976351, 0.45901639344262296, 0.7892271662763466)
# number of queries whose gold-answer rank differs from the FAISS ranking
print((faiss_result[0]["rank"] != chroma_result[0]["rank"]).sum())
print((faiss_result[0]["rank"] != lmidx_result[0]["rank"]).sum())
print((faiss_result[0]["rank"] != elastic_result[0]["rank"]).sum())
print((faiss_result[0]["rank"] != bm25_result[0]["rank"]).sum())
1
153
3
265
import os
# workaround for the "duplicate OpenMP runtime" abort on macOS when multiple BLAS/OpenMP builds are loaded
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
from langchain.embeddings import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name="distiluse-base-multilingual-cased-v2")
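The sentence-transformers model is loaded but never evaluated in this notebook; as a minimal sketch (the names hf_faiss_vectorstore and hf_result are illustrative), it could be plugged into the same FAISS pipeline used above:

hf_faiss_vectorstore = FAISS.from_texts(
    texts=a_df["Text"].tolist(),
    embedding=embeddings,   # the HuggingFaceEmbeddings instance defined above
    metadatas=metadata,
)
hf_result = evaluate(query_list, hf_faiss_vectorstore)
print(hf_result[1:])        # (MRR, Recall@1, Recall@5)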
# serialize the embedding model object with pickle
import pickle
with open("embeddings.pkl", "wb") as f:
    pickle.dump(embeddings, f)
from langchain.embeddings import TensorflowHubEmbeddings

embedding_tfhub = TensorflowHubEmbeddings()
query_result = embedding_tfhub.embed_query("日本語")
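A quick check of the returned vector (a sketch; the length depends on which TF Hub model LangChain loads by default):

print(type(query_result), len(query_result))  # a plain list of floats; length = embedding dimension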
Collecting tensorflow-hub
  Downloading tensorflow_hub-0.13.0-py2.py3-none-any.whl (100 kB)
Installing collected packages: tensorflow-hub
Successfully installed tensorflow-hub-0.13.0
Note: you may need to restart the kernel to use updated packages.
import torch
import tensorflow_hub
conda install nomkl
Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /Users/dayuan.jiang/opt/anaconda3/envs/notion-db

  added / updated specs:
    - nomkl

The following NEW packages will be INSTALLED:

  blas               pkgs/main/osx-64::blas-1.0-openblas
  nomkl              pkgs/main/osx-64::nomkl-3.0-0

Preparing transaction: done
Verifying transaction: done
Executing transaction: done

Note: you may need to restart the kernel to use updated packages.