RAG for Domain-Specific QA with Reference using T5 & GPT-3.5-Turbo¶
Tutorial-5 is a continuation of Tutorial-4. Therefore, please read the description in Tutorial-4 and complete it before proceeding here. Everything from importing libraries/packages to retrieving the context articles (retrieved_articles_indices) remains the same in both tutorials (Tutorial-4 and 5). In the previous tutorial, we did not add a reference to the article from which the model generated its answer. In Tutorial-5, we will do that. We will explore different ways to accomplish this using the T5-base and GPT-3.5-Turbo models.
Once again, for this tutorial, we will demonstrate RAG on a news article dataset. I have downloaded a CNN news article dataset from Kaggle (https://www.kaggle.com/datasets/hadasu92/cnn-articles-after-basic-cleaning). The 23.1 MB CSV dataset contains 4,729 articles with columns 'Index', 'Author', 'Date published', 'Category', 'Section', 'Url', 'Headline', 'Description', 'Keywords', 'Second headline', and 'article'. For simplicity, in this tutorial, we used only the main article body ('article' column).
First, let's import the necessary libraries, tools, and packages.
import pandas as pd
import torch
import os
import faiss
import numpy as np
import pickle
import openai
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
from transformers import T5Tokenizer, T5ForConditionalGeneration
from openai import OpenAI
from torch.nn.functional import softmax
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f">>We are using {device} device.<<")
>>We are using cuda device.<<
Now, import the dataset. Since the dataset has already been cleaned by its author, I decided to use it as it is. Due to computational limitations, I decided to use only the articles with a length of 512 words or less, creating a new dataset called df_filtered.
df = pd.read_csv('nahidOrg_files/news_articles.csv')
df['word_count'] = df['article'].astype(str).apply(lambda x: len(x.split()))
df_filtered = df[df['word_count'] <= 512].copy()
df_filtered = df_filtered.reset_index(drop=True)
Let's take a look at a random article (maybe 57?) in our filtered dataset.
df_filtered['article'][57]
' (CNN)Joe Biden has been president for a little over a year. And that year was not kind to him. In a new NPR/PBS/Marist College poll, more than half -- 56% -- of Americans said that Biden\'s first year in office was a "failure," while just 39% described it as a success. The news doesn\'t get better the more you dig into the survey. Two-thirds of independents said Biden\'s first year was a failure, while more than 9 in 10 Republicans (91%) agreed with that assessment.Read More Biden\'s numbers are better among Democrats -- 80% called year one a success -- but 15% of members of his own party described his first year in office as a failure. Now, asking such a binary question -- either Biden\'s first year was a success or a failure, with no room in the middle -- does tend to strip any nuance from issue. There are incredible complexities that go into assessing how a president has done. Oftentimes, a president is judged in one way during his time in office and in another after he leaves, once the impacts of his policies come into clearer focus. That said, elections tend to force voters to think in this all-or-nothing way. Either you vote for a Democrat or for a Republican. Either you vote to re-elect your incumbent or you choose the challenger.Seen through that political lens, these poll numbers are extremely problematic for Democrats on the ballot this fall. We know that, historically, the first midterm election of a president\'s term is a referendum on his time in office up to that point. The Point: If the public\'s report card on Biden\'s second year in office is anything like the one for his first year, Democrats can kiss their House and Senate majorities goodbye.'
Looks good. Let's save the new dataset with the word counts to our local storage.
df_filtered.to_csv('nahidOrg_files/news_articles_filtered.csv')
Now, it's time for embeddings. Embeddings capture the semantic meaning of each article, allowing us to compare articles and find ones similar to the question. I have decided to use the all-MiniLM-L6-v2 (https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) pretrained sentence-transformer model, which is specifically designed to convert sentences (or whole articles) into numerical representations (embeddings). You can use any other model of your choice.
tokenizer_for_embeddings = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model_for_embeddings = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2").to(device)
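As a side note, if you prefer a higher-level API, the sentence-transformers library wraps this same model and applies its recommended pooling for you. The snippet below is only an illustrative alternative (it assumes the sentence-transformers package is installed); the rest of this tutorial sticks with the plain transformers approach above.
from sentence_transformers import SentenceTransformer
# Alternative sketch: the library handles tokenization and pooling internally
st_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=str(device))
example_embedding = st_model.encode("A short test sentence.")
print(example_embedding.shape)  # (384,)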
Now, the method nahidOrg_generate_embeddings uses the model to tokenize and generate embeddings for the articles. The dataset's 'article' column is converted into a list, and, using a loop, embeddings are generated one by one. The embeddings of all articles are then collected into article_embeddings.
def nahidOrg_generate_embeddings(text):
    inputs = tokenizer_for_embeddings(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model_for_embeddings(**inputs)
    # Mean pooling over the token dimension gives one 384-dimensional vector per text
    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
articles = df_filtered['article'].tolist()
article_embeddings = []
for doc in tqdm(articles, desc="Generating Embeddings For All Articles:"):
    article_embeddings.append(nahidOrg_generate_embeddings(doc))
Generating Embeddings For All Articles:: 100%|██████████| 1696/1696 [00:06<00:00, 262.87it/s]
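Generating the embeddings one article at a time keeps the code simple, which is fine for 1,696 articles. For larger datasets, you can batch the articles instead; the sketch below applies the same mean pooling in batches (the batch size of 32 is an arbitrary illustrative choice, and because padding tokens are included in the mean, the values can differ slightly from the one-by-one loop).
batch_size = 32
article_embeddings_batched = []
for start in tqdm(range(0, len(articles), batch_size), desc="Generating Embeddings In Batches:"):
    batch = articles[start:start + batch_size]
    inputs = tokenizer_for_embeddings(batch, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model_for_embeddings(**inputs)
    # Mean pooling over the token dimension: one 384-dimensional vector per article in the batch
    article_embeddings_batched.extend(outputs.last_hidden_state.mean(dim=1).cpu().numpy())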
The all-MiniLM-L6-v2 model maps each article to a 384-dimensional dense vector space. Let's examine the embeddings for the article with index 57 and check the dimension of its vector.
print(f"Dimension: {len(article_embeddings[57])} \nVectors: {article_embeddings[57]}" )
Dimension: 384 Vectors: [-2.01892834e-02 -1.14912458e-01 1.79007471e-01 -4.92094830e-02 -5.30712046e-02 7.04352325e-03 -6.71370029e-02 2.57717613e-02 2.37255320e-02 -9.33224242e-03 -8.10500458e-02 7.36312866e-02 1.02148175e-01 -8.13134760e-02 -3.62228192e-02 3.80506255e-02 -7.86402747e-02 -2.01281793e-02 5.02002379e-03 1.45729259e-01 1.04409298e-02 -1.39027685e-02 1.11154690e-02 -6.20222762e-02 9.10239220e-02 -7.37913623e-02 -4.15083878e-02 3.25369611e-02 -1.23255804e-01 2.53378414e-02 1.19637847e-01 8.17944389e-03 2.00212076e-02 -2.43159775e-02 1.35595892e-02 -8.83679613e-02 -3.15758251e-02 3.64803709e-02 7.26935565e-02 -9.51444544e-03 2.54194271e-02 -8.85571241e-02 -2.54622865e-02 -1.14408731e-01 -5.08983135e-02 -6.31365925e-02 3.24245319e-02 3.40552106e-02 5.87988272e-02 -1.88004668e-03 -6.26266152e-02 1.66669995e-01 1.19201124e-01 4.45382930e-02 6.01483397e-02 -8.27206299e-03 -9.85154882e-02 7.78866783e-02 -1.53992241e-02 1.42525025e-02 -3.11543774e-02 5.14553487e-02 -1.19240202e-01 4.12306329e-03 -7.37935677e-02 -2.59105470e-02 -1.80375129e-02 -6.41717166e-02 -1.26709357e-01 1.05186462e-01 4.82414998e-02 8.39416906e-02 -9.23505872e-02 2.80472543e-02 1.91233698e-02 -3.86493616e-02 6.80598915e-02 1.81943759e-01 7.24544674e-02 2.53408682e-02 2.24920530e-02 6.29734546e-02 -7.23356307e-02 4.58484553e-02 4.65138964e-02 -2.12726882e-03 3.29738073e-02 -7.56233409e-02 -5.63905947e-02 -2.50272937e-02 -1.33154660e-01 -9.14965477e-03 -3.69019061e-02 5.35627268e-02 2.02158943e-01 -1.00977151e-02 1.55723676e-01 9.80109349e-02 6.90655503e-03 9.96548012e-02 -1.07611544e-01 2.88127866e-02 -6.42951652e-02 -1.36958063e-01 5.53582385e-02 -2.78255343e-02 -2.55129691e-02 -4.73257825e-02 -8.44465941e-03 1.20440172e-02 -4.25070897e-02 -1.04748830e-01 1.21579366e-02 3.03675979e-03 5.20857424e-02 -2.02560917e-01 -3.79728228e-02 9.63516906e-03 -1.09964507e-02 9.28812623e-02 1.96290556e-02 7.86204413e-02 1.03023462e-02 -5.65868020e-02 1.00512005e-01 -1.32752120e-01 1.24412365e-02 7.16248104e-33 1.19891018e-01 -5.26563227e-02 -1.77421961e-02 1.51675925e-01 -2.12321073e-01 1.83724120e-01 -5.97830955e-03 -7.10825156e-03 4.57776971e-02 -9.86153912e-03 -1.29151061e-01 2.46302132e-02 1.03053777e-02 3.80883925e-02 1.26627252e-01 7.17195123e-02 -8.61205906e-02 1.61148757e-02 -7.43382871e-02 -5.68589531e-02 -6.19590543e-02 -8.01299140e-03 9.71254408e-02 -2.40886137e-02 6.21751361e-02 -8.09162185e-02 2.36392859e-02 -7.37583218e-03 -1.51392609e-01 3.70808644e-03 -2.84876078e-02 -5.69950938e-02 5.14408748e-04 2.88289171e-02 -1.39162689e-01 -1.02060236e-01 3.09253559e-02 3.40579301e-02 2.55051651e-04 -8.58151615e-02 -1.17827259e-01 9.74867120e-02 -2.52726750e-04 9.50914472e-02 4.28357571e-02 -2.27661822e-02 9.03079733e-02 2.51530334e-02 1.69970992e-03 5.42469434e-02 6.98917732e-02 1.77767515e-01 1.01885892e-01 8.10813680e-02 -9.86912847e-02 -1.11007482e-01 1.22445934e-02 -9.82034877e-02 -8.12226757e-02 -1.22838803e-01 -2.26601232e-02 4.77053151e-02 -6.41654432e-02 -1.08614881e-02 -3.07090618e-02 1.49837360e-01 -8.34392384e-02 1.04294000e-02 5.10281883e-03 3.26119661e-02 2.17934623e-02 -1.64406136e-01 -1.17238581e-01 -1.65872395e-01 1.78990573e-01 6.91657364e-02 1.64821178e-01 -5.91278300e-02 1.40000418e-01 -4.61488888e-02 -3.08363363e-02 -1.33843258e-01 1.85990557e-01 -3.38812172e-02 -5.19311614e-02 3.06764059e-02 1.77480459e-01 -1.51981702e-02 1.30993556e-02 -1.92077085e-02 4.39916961e-02 -2.84911580e-02 -5.51112257e-02 -2.18046065e-02 3.31922211e-02 -1.04522695e-32 -1.88462228e-01 -5.79406954e-02 2.62207408e-02 
1.27757281e-01 1.27181839e-02 -3.79264951e-02 -5.52784167e-02 -2.30416469e-02 -5.83647713e-02 -2.54219115e-01 -2.61844024e-02 1.71815734e-02 4.38315831e-02 7.52230585e-02 -7.06655085e-02 6.00856207e-02 -2.07208153e-02 -1.67866483e-01 -5.72751127e-02 2.57421471e-02 9.98094380e-02 3.17116439e-01 -1.04443654e-01 1.74125526e-02 -9.59867090e-02 -8.14053193e-02 9.85870697e-03 -5.54250330e-02 7.70409703e-02 -1.08991869e-01 -1.89289656e-02 -4.53761034e-02 1.40071921e-02 -5.28207202e-05 1.14096463e-01 1.12393469e-01 -3.63129899e-02 -1.55875221e-01 -3.74694355e-02 1.44593731e-01 8.00276846e-02 -1.40194565e-01 2.20788550e-03 -3.04751918e-02 -3.28673087e-02 -2.96463221e-02 -5.44768460e-02 1.18773589e-02 -4.23803180e-02 5.94725125e-02 -5.59582226e-02 9.28407386e-02 -1.87613741e-02 1.03839673e-01 4.82336096e-02 -1.69674437e-02 -9.30988863e-02 3.17105800e-02 -2.40745451e-02 1.18087888e-01 -8.13131183e-02 3.80030856e-03 1.18802942e-01 -1.24176487e-01 -2.09780242e-02 3.49535681e-02 -5.48393987e-02 -5.74621744e-02 7.16111511e-02 2.26743314e-02 -3.67218740e-02 -5.19035533e-02 1.09749446e-02 -5.15675172e-03 1.32910743e-01 1.08401679e-01 -9.15631652e-02 -3.48400720e-03 -6.74914196e-02 4.15285081e-02 -1.56219482e-01 -4.18921150e-02 -7.06432983e-02 -1.18980460e-01 -3.10220085e-02 7.00604022e-02 6.02825470e-02 -2.23237425e-01 -9.32039618e-02 1.03252558e-02 5.32658678e-03 9.78887454e-03 -1.19128287e-01 1.99773479e-02 -4.64258976e-02 -1.00708078e-07 7.26036802e-02 8.89522061e-02 -2.14765109e-02 6.24277778e-02 2.73415893e-02 1.06242429e-02 1.06720245e-02 -7.04248622e-02 3.59466411e-02 -1.29500488e-02 2.42287114e-01 3.41082551e-02 -6.06526509e-02 -5.57040684e-02 1.64863933e-02 4.06796634e-02 -1.52448341e-02 6.12530559e-02 -8.88596773e-02 -1.06397524e-01 -2.21159104e-02 -1.11982645e-02 -6.37607574e-02 4.63510975e-02 4.92123440e-02 -5.90871684e-02 -3.05061154e-02 8.88455510e-02 -8.40443745e-02 -2.61514988e-02 1.06942005e-01 -5.11405133e-02 2.10148897e-02 -6.86062425e-02 2.30475832e-02 1.27767712e-01 -6.39184937e-02 4.55625989e-02 8.07546377e-02 -7.09113032e-02 4.98737767e-03 7.09691569e-02 -3.51500437e-02 3.77463177e-02 -4.36865501e-02 -3.76870893e-02 2.67856847e-02 1.61938712e-01 6.23990111e-02 -1.68922514e-01 -6.50245184e-03 9.39275622e-02 -3.83484401e-02 -2.91046929e-02 1.84935793e-01 7.51326382e-02 -4.41047400e-02 -1.81095749e-02 -5.84891066e-02 2.42759418e-02 -2.44450830e-02 4.38334122e-02 -1.06747225e-01 1.07360572e-01]
Additionally, you can save the embeddings for later use to reduce computation time for future work with the same dataset. I used the pickle library to save the embeddings, but you can use other methods or libraries as well.
with open('nahidOrg_files/article_embeddings.pkl', 'wb') as f:
pickle.dump(article_embeddings, f)
Loading the embeddings:
with open('nahidOrg_files/article_embeddings.pkl', 'rb') as f:
article_embeddings = pickle.load(f)
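As mentioned, pickle is only one option. Since the embeddings are fixed-size vectors, numpy's own save/load works just as well; a small sketch (the .npy file name here is hypothetical):
np.save('nahidOrg_files/article_embeddings.npy', np.array(article_embeddings))  # save as a 2-D array
article_embeddings = list(np.load('nahidOrg_files/article_embeddings.npy'))     # load back as a list of vectors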
For efficient similarity search, I have used a FAISS (Facebook AI Similarity Search) index. In my opinion, FAISS is excellent for similarity search. I used IndexFlatL2 indexing, which is highly accurate but a bit slow. It is perfect for small datasets like our news articles but not always suitable for larger ones. If you are working with very large datasets, you can use IndexIVFFlat indexing instead, which is faster but not as accurate as IndexFlatL2 (a small sketch follows the next code cell). First, we determine the dimension of the article embeddings (384). Next, we build the IndexFlatL2 FAISS index, which uses the L2 (Euclidean) distance metric to measure similarity between article vectors.
dimension = len(article_embeddings[0])
index = faiss.IndexFlatL2(dimension)
index.add(np.array(article_embeddings))
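For reference, the faster IndexIVFFlat variant mentioned above needs a training step before vectors can be added. The sketch below shows how it could be built; the nlist and nprobe values are illustrative, not tuned.
embeddings_matrix = np.array(article_embeddings).astype('float32')
nlist = 100                               # number of clusters (illustrative choice)
quantizer = faiss.IndexFlatL2(dimension)  # coarse quantizer used to assign vectors to clusters
index_ivf = faiss.IndexIVFFlat(quantizer, dimension, nlist)
index_ivf.train(embeddings_matrix)        # IVF indexes must be trained before adding vectors
index_ivf.add(embeddings_matrix)
index_ivf.nprobe = 10                     # clusters searched per query (speed/accuracy trade-off)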
Now, it's time to retrieve similar articles that might contain the answer to the user's question. The method nahidOrg_retrieve_documents takes the question's embeddings and a value k, representing the number of articles you want to retrieve, and returns the indices of the articles that might contain the answer.
def nahidOrg_retrieve_documents(query_embedding, k):
    distances, indices = index.search(np.array([query_embedding]), k)
    return indices[0]
Now, let's take the question and find its embeddings.
question = "What is the estimated financial impact on products due to the ban on Russian steel imports, according to the Commission?"
question_embeddings = nahidOrg_generate_embeddings(question)
Now, with the embeddings of the question, we will search and retrieve the k=3 closest candidate articles that might contain the answer.
retrieved_articles_indices = nahidOrg_retrieve_documents(question_embeddings, 3)
retrieved_articles_indices
array([0, 4, 2], dtype=int64)
For the question 'What is the estimated financial impact on products due to the ban on Russian steel imports, according to the Commission?', we got three article indices, [0, 4, 2], since k = 3. You can adjust this according to your needs.
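If you want a quick sanity check of what was retrieved before generating any answers, you can print the first few words of each candidate article:
for i in retrieved_articles_indices:
    print(f"Article {i}: {df_filtered['article'][i][:100]}...")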
The code in Tutorial-4 retrieves the top 3 articles relevant to the query, combines them into a single context, and then uses the T5-base model to generate an answer. However, it doesn't track which article the answer came from, making it difficult to provide a proper reference (author and publication date). To solve this, we can modify the nahidOrg_answer_by_t5 function to process each retrieved article individually. The model generates an answer for each article and also computes a confidence score for that answer.
model_name = "t5-base"
tokenizer_t5_base = T5Tokenizer.from_pretrained(model_name, legacy=False)
model_t5_base = T5ForConditionalGeneration.from_pretrained(model_name).to(device)
def nahidOrg_answer_by_t5(question, context, max_length=150):
    input_text = f"question: {question} context: {context}"
    input_ids = tokenizer_t5_base.encode(input_text, return_tensors="pt").to(device)
    outputs = model_t5_base.generate(input_ids, max_length=max_length, num_beams=5, early_stopping=True, return_dict_in_generate=True, output_scores=True)
    answer = tokenizer_t5_base.decode(outputs.sequences[0], skip_special_tokens=True)
    # The following portion is newly added in Tutorial-5 to find the confidence score.
    # To get the scores, return_dict_in_generate and output_scores are set to True in model.generate.
    logits = torch.stack(outputs.scores, dim=1)
    probabilities = softmax(logits, dim=-1)
    # Align the generated token ids (skipping the initial decoder start token) with the per-step probabilities
    gen_sequence_indices = outputs.sequences[:, 1:].unsqueeze(-1)
    probabilities = probabilities[:, :gen_sequence_indices.shape[1], :]
    generated_token_probs = probabilities.gather(2, gen_sequence_indices).squeeze(-1)
    # The average probability of the generated tokens serves as the confidence score
    avg_confidence = generated_token_probs.mean().item()
    return answer, avg_confidence
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Now, let's use the T5-base model to generate answers for the same question based on each retrieved context [0, 4, 2]. We will store all answers with the necessary references (author and date_published) that we want to display with the answers. To do this efficiently, each answer goes into a dictionary containing the author name, publication date, answer, and confidence score, and these dictionaries are collected in a list.
all_answers = []
for index in retrieved_articles_indices:
    answer_entry = {}
    answer_entry['author'] = df_filtered['author'][index]
    answer_entry['date_published'] = df_filtered['date_published'][index]
    answer, confidence = nahidOrg_answer_by_t5(question, df_filtered['article'][index])
    answer_entry['answer'] = answer
    answer_entry['confidence'] = confidence
    all_answers.append(answer_entry)
Let's see all the answers and their confidence scores for all 3 articles.
all_answers
[{'author': 'Reuters', 'date_published': '2022-03-15 11:27:02', 'answer': '3.3 billion euros ($3.6 billion)', 'confidence': 0.9915035963058472}, {'author': 'Matt Egan, CNN Business', 'date_published': '2022-03-14 12:55:43', 'answer': 'nearly $10 billion', 'confidence': 0.31741154193878174}, {'author': 'Reuters', 'date_published': '2022-03-15 04:49:54', 'answer': 'a 1 million yen ($8,487.52) fine', 'confidence': 0.009797605685889721}]
We can clearly see that the model is 99% confident about the answer '3.3 billion euros ($3.6 billion)'. Now, we will use a loop to find the answer with the highest confidence and display the result with its reference.
highest_conf = -1
for answer in all_answers:
    if highest_conf < answer['confidence']:
        highest_conf = answer['confidence']
        answer_with_reference = answer['answer'] + ' [' + answer['author'] + ' ' + answer['date_published'] + ']'
answer_with_reference
'3.3 billion euros ($3.6 billion) [Reuters 2022-03-15 11:27:02]'
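The same selection can also be written more compactly with Python's built-in max; this small sketch produces the identical result:
best_answer = max(all_answers, key=lambda entry: entry['confidence'])
answer_with_reference = f"{best_answer['answer']} [{best_answer['author']} {best_answer['date_published']}]"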
Now, we will try to do the same using GPT-3.5-Turbo. However, the OpenAI chat completion we use here does not return an overall confidence score for its answers, so this is harder to do with GPT models. Still, I managed to do it with a little trick.
The plan is simple: we will merge all 3 articles together in the usual way and use the gpt-3.5-turbo model to generate an answer, just like in Tutorial-4. Then, we will generate 3 answers for the 3 most relevant articles individually. Next, we will measure the similarity between each individual answer and the merged-article answer. The individual answer with the highest similarity to the merged answer indicates the article that contains the answer.
Now, we will merge the 3 most relevant articles together to generate an answer based on the whole scenario.
retrieved_articles_series = pd.Series(retrieved_articles_indices)
retrieved_text_articles = retrieved_articles_series.apply(lambda x: df_filtered['article'][x]).tolist()
def nahidOrg_processing_context(docs, max_tokens=1536):
    processed_docs = []
    total_tokens = 0
    for doc in docs:
        tokens = len(doc.split())  # rough word-based token estimate
        if total_tokens + tokens <= max_tokens:
            processed_docs.append(doc)
            total_tokens += tokens
        else:
            break
    return "\n".join(f"- {doc}" for doc in processed_docs)
final_retrieved_contexts = nahidOrg_processing_context(retrieved_text_articles)
The method nahidOrg_answer_by_gpt35_turbo will take a question and context to generate an answer. Details about GPT-3.5-Turbo, model pricing, API key, and other information can be found in Tutorial-4 below Cell 61.
client = OpenAI(api_key='sk-proj-XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
def nahidOrg_answer_by_gpt35_turbo(question, contexts, max_tokens):
    prompt = f"""Answer the question based on the provided context. If the context doesn't contain the answer, say "Sorry Nahid! I don't have enough information to answer that."
    **Question:** {question}
    **Context:** {contexts}
    """
    messages = [
        {"role": "system", "content": "You are an intelligent assistant that answers questions based on the provided context."},
        {"role": "user", "content": prompt}
    ]
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
        max_tokens=max_tokens,
    )
    return response.choices[0].message.content
Finding the answer based on all 3 merged articles.
answer_gpt35 = nahidOrg_answer_by_gpt35_turbo(question, final_retrieved_contexts, 250)
answer_gpt35
'The estimated financial impact on products due to the ban on Russian steel imports is 3.3 billion euros ($3.6 billion), according to the Commission.'
Now, just like with T5-base, we will use the GPT-3.5-Turbo model to generate answers for the same question based on each retrieved context [0, 4, 2]. We will store all answers with the necessary references (author and date_published) that we want to display with the answers. Again, each answer goes into a dictionary containing the author name, publication date, and answer. For the similarity check we will also need embeddings, which we generate later.
all_answers_gpt35 = []
for index in retrieved_articles_indices:
    answer_entry = {}
    answer_entry['author'] = df_filtered['author'][index]
    answer_entry['date_published'] = df_filtered['date_published'][index]
    answer = nahidOrg_answer_by_gpt35_turbo(question, df_filtered['article'][index], 250)
    answer_entry['answer'] = answer
    answer_entry['embeddings'] = ""
    all_answers_gpt35.append(answer_entry)
Let's see how GPT-3.5 generated answers for each article.
all_answers_gpt35
[{'author': 'Reuters', 'date_published': '2022-03-15 11:27:02', 'answer': 'The estimated financial impact on products due to the ban on Russian steel imports is 3.3 billion euros ($3.6 billion), according to the Commission.', 'embeddings': ''}, {'author': 'Matt Egan, CNN Business', 'date_published': '2022-03-14 12:55:43', 'answer': "Sorry Nahid! I don't have enough information to answer that.", 'embeddings': ''}, {'author': 'Reuters', 'date_published': '2022-03-15 04:49:54', 'answer': "Sorry Nahid! I don't have enough information to answer that.", 'embeddings': ''}]
We can see that for the last two articles, the answer is 'Sorry Nahid! I don't have enough information to answer that.', which indicates that the model did not find an appropriate, meaningful answer to the question in those articles; that assessment seems perfectly correct. Even though our human judgment tells us there is only one right answer here, the machine cannot know that on its own, so we will proceed as planned.
Now, it’s time to find the embeddings of all individual answers and the answer from the merged articles. Using FAISS, we will find the distances and indices of answers in the all_answers_gpt35 dictionary, sorted accordingly. Please note that these indices are not the indices of the main article dataset, but rather the indices in the dictionary. Since the dictionary already contains the necessary references, we don’t need the actual indices of the articles in the dataset. However, that can be achieved easily with a minor tweak.
for each_answer in all_answers_gpt35:
    each_answer['embeddings'] = nahidOrg_generate_embeddings(each_answer['answer'])
embeddings = [answer['embeddings'] for answer in all_answers_gpt35]
dimension = len(embeddings[0])
index_gpt35_answers = faiss.IndexFlatL2(dimension)
index_gpt35_answers.add(np.array(embeddings))
distances, indices = index_gpt35_answers.search(np.array([nahidOrg_generate_embeddings(answer_gpt35)]), 3)
print(f"Sorted Indices in all_answers_gpt35 dictionary: {indices}\nDistances: {distances}")
Sorted Indices in all_answers_gpt35 dictionary: [[0 1 2]] Distances: [[ 0. 17.777369 17.777369]]
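Here, L2 distance over the raw answer embeddings is enough to separate the matching answer from the 'Sorry Nahid!' ones. If you prefer cosine similarity, you can normalize the vectors and use an inner-product index instead; a sketch of the same comparison:
answer_matrix = np.array(embeddings).astype('float32')
faiss.normalize_L2(answer_matrix)              # in-place L2 normalization
index_ip = faiss.IndexFlatIP(dimension)        # inner product equals cosine similarity after normalization
index_ip.add(answer_matrix)
query_vec = np.array([nahidOrg_generate_embeddings(answer_gpt35)]).astype('float32')
faiss.normalize_L2(query_vec)
similarities, indices_cosine = index_ip.search(query_vec, 3)  # higher score means more similar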
Since the indices in the indices list are sorted by distance, we can be certain that the first index ([0][0]) points to the entry in all_answers_gpt35 that contains the correct answer. Now, let's format the output with the author's name and the date published as references.
answer_with_reference_gpt35 = all_answers_gpt35[indices[0][0]]['answer'] + '[' + all_answers_gpt35[indices[0][0]]['author'] + ' ' + all_answers_gpt35[indices[0][0]]['date_published'] + ']'
answer_with_reference_gpt35
'The estimated financial impact on products due to the ban on Russian steel imports is 3.3 billion euros ($3.6 billion), according to the Commission.[Reuters 2022-03-15 11:27:02]'
You can post-process the answer according to your preference. This is the end of this tutorial. I encourage you to explore other pretrained LLMs and techniques to achieve even better answers and formatting. Thanks for visiting nahid.org!