In [1]:
%pip install -qU "langchain-chroma>=0.1.2"

Note: you may need to restart the kernel to use updated packages.


In [22]:
%pip install -qU langchain-openai langchain-text-splitters


Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = "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"
os.environ['LANGCHAIN_PROJECT'] = "emails"
from langsmith import utils
utils.tracing_is_enabled()

True

In [2]:
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.prompts import ChatPromptTemplate
import getpass
import os
from typing import List, Dict


if not os.environ.get("OPENAI_API_KEY"):
  os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

from langchain_openai import OpenAIEmbeddings



# Initialize components
llm = ChatOpenAI(model="gpt-4o-2024-11-20", temperature=0)

embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# step 2: retrieve answers from the vector database


## retrieve the answer with confidence score:

In [5]:
from typing import List ,Dict
from pydantic import BaseModel, Field
import json
import json_repair

# Define the output structure for a single answer.
class QuestionAnswer(BaseModel):
    question: str = Field(description="The question asked")
    answer: str = Field(description="The answer to the question")
    confidence: float = Field(
        description="Confidence score for the answer (0.0 to 1.0)", ge=0.0, le=1.0
    )

# Define the overall response structure.
class QAResponse(BaseModel):
    qa_pairs: List[QuestionAnswer] = Field(
        description="List of question-answer pairs with confidence scores"
    )

def get_answer_messages(context: str, question: str) -> list:
    # Create a JSON schema for the expected answer output.
    schema_str = json.dumps(QuestionAnswer.model_json_schema(), ensure_ascii=False)
    
    return [
        {
            "role": "system",
            "content": "\n".join([
                "You are an expert at retrieving answers from a provided context.",
                "Read the context carefully and answer the question that follows using only the information provided.",
                "If the context does not contain enough details to fully answer the question, return an empty answer ('[]') and a confidence score of 0.0.",
                "Assess your confidence in your answer on a scale from 0.0 (no confidence) to 1.0 (complete confidence).",
                "Please respond in exactly the following JSON format (without any additional commentary):",
                "",
                "```json",
                schema_str,
                "```"
            ])
        },
        {
            "role": "human",
            "content": "\n".join([
                "## Context:",
                context,
                "",
                "## Output Schema:",
                schema_str,
                "",
                "## Question:",
                question,
                "",
                "## Provide your answer in the specified JSON format."
            ])
        }
    ]

def retrieve_answers(questions: List[str], top_k: int = 3) -> QAResponse:
    """
    For each question, perform a similarity search to retrieve documents (real data),
    build a context, prompt the LLM, parse the response, and collect the results.
    """
    qa_pairs = []
    for question in questions:
        # Retrieve documents using your vector store (assumed to be defined).
        docs = vector_store.similarity_search(question, k=top_k)
        
        # Debug: Print a snippet of the retrieved documents.
        print(f"\nRetrieved documents for question: {question}")
        for i, doc in enumerate(docs, 1):
            print(f"Document {i}: {doc.page_content[:200]}...")
        
        # Combine retrieved document content into one context string.
        combined_context = "\n".join(doc.page_content for doc in docs)
        
        # Build prompt messages (with the output schema included).
        messages = get_answer_messages(combined_context, question)
        
        # Invoke the LLM to retrieve the answer.
        response = llm.invoke(messages)
        
        # Debug: Print the raw LLM response.
        print(f"\nRaw LLM response for question: {question}")
        print(response.content)
        
        # Parse the response (using json_repair to handle minor formatting issues).
        try:
            data = json_repair.loads(response.content)
            answer = data.get("answer", "[]")
            confidence = float(data.get("confidence", 0.0))
        except Exception as e:
            print(f"Error parsing response for question '{question}': {e}")
            answer = response.content
            confidence = 0.0
        
        # Create a QuestionAnswer instance.
        qa_pair = QuestionAnswer(
            question=question,
            answer=answer,
            confidence=confidence
        )
        qa_pairs.append(qa_pair)
    
    return QAResponse(qa_pairs=qa_pairs)

# --------------------------
# Example Usage with Real Data
# --------------------------

# Suppose you have received the following email:
recieved_email_content = """
    Hi,
        Thank you for your email!
        
        
        If you decide to apply to become a Birmingham City University agent, you should:
        
        

        where your offices?
        provide proof of education agent training or qualifications
        provide proof of business registration from your country, and
        provide proof of any academic qualifications or professional recognition

        Mohamed salama
        technical support
"""

# 1. Extract questions from the received email.
extracted_questions_obj = extract_questions(recieved_email_content)
print(recieved_email_content)
print("Extracted Questions (JSON):")
print(extracted_questions_obj.model_dump_json(indent=2))

# 2. Check if any questions were extracted.
if not extracted_questions_obj.questions:
    # No questions found: call the external function.
    escalation_message = call_human_agent()
    print(escalation_message)
else:
    # Proceed to retrieve answers using the real extracted questions.
    qa_response = retrieve_answers(extracted_questions_obj.questions)
    print("\nFinal Results:")
    print(qa_response.model_dump_json(indent=2))


    Hi,
        Thank you for your email!
        
        
        If you decide to apply to become a Birmingham City University agent, you should:
        
        

        where your offices?
        provide proof of education agent training or qualifications
        provide proof of business registration from your country, and
        provide proof of any academic qualifications or professional recognition

        Mohamed salama
        technical support

Extracted Questions (JSON):
{
  "questions": [
    "Where are your offices?"
  ],
  "requires_documents": true
}

Retrieved documents for question: Where are your offices?
Document 1: ## Where Our Offices  
** Kuwait**
- **Address:** Al Manakh Complex, 33 Bahrain St, Salmiya, Kuwait
- **Phone:** +965 9870 4004
- **Email:** Kuwait@oktamam.com  
**Turkey**
- **Address:** Şehremini Ma...
Document 2: We offer educational, real estate, tourist, media and advertising, commercial and legal services, investment, residency, insurance, n

In [104]:
qa_response

QAResponse(qa_pairs=[QuestionAnswer(question='Could you provide the contact details for two references?', answer='Kuwait: Phone: +965 9870 4004, Email: Kuwait@oktamam.com; Turkey: Phone: 00905494402000, Email: Turkey@oktamam.com', confidence=1.0), QuestionAnswer(question='Could you provide proof of education agent training or qualifications?', answer='[]', confidence=0.0), QuestionAnswer(question='Could you provide proof of business registration from your country?', answer='[]', confidence=0.0), QuestionAnswer(question='Could you provide proof of any academic qualifications or professional recognition?', answer='[]', confidence=0.0)])

# Embedding for new script data 

In [12]:
import re
from typing import List, Dict, Any

file_path = r"C:/oktamam_project_Backup/Nootbooks/cleaned_scraped_content/Kadir Has University_cleaned.txt"

with open(file_path, encoding="utf-8") as f:
    content = f.read()

pattern = re.compile(r'--->URL_(\d+)\s*(.*?)\s*(?=(--->URL_\d+)|\Z)', re.S)

matches = pattern.findall(content)
print(f"Found {len(matches)} URL patterns in the file")

segments: list[dict] = [
    {"url_number": int(num), "text": body.strip()}
    for num, body, _ in matches
]

# Example: print the first two segments
for seg in segments[:2]:
    print(f"URL_{seg['url_number']}", "-" * 40)
    print(seg["text"][:500], "...\n")   # preview first 500 chars


Found 19488 URL patterns in the file
URL_32 ----------------------------------------
Kadir Has Üniversitesi

Lisans Akademik Takvimi

Lisansüstü Akademik Takvimi

Yabancı Diller Yüksekokulu Akademik Takvimi ...

URL_26 ----------------------------------------
Detaylı Bilgi Prof. Dr. Hasan Tekgüç’ün Editörlüğünde Yeni Bir Kitap  
Detaylı Bilgi TÜBİTAK ARDEB 1001 Programı Kapsamında KHAS’a 2 Yeni Proje
Desteği  Detaylı Bilgi Türkiye Ekonomisinin Yapısı, Genişletilmiş İkinci
Baskısıyla Raflarda  Detaylı Bilgi IIPPE 2024 Annual Conference, 4-7 Eylül
2024’te KHAS’ta!  Detaylı Bilgi EWIS 2024, 3-5 Temmuz'da Kadir Has
Üniversitesi’nde Yapılacak  Detaylı Bilgi Doç. Dr. Onurcan Yılmaz’dan Yeni Bir
Kitap: Komplo Teorilerine Neden İnanırız?  Detaylı Bilgi TÜBİT ...



In [19]:
import re
import os
from pathlib import Path
from typing import List, Dict, Any

from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

# ============================================================================
# STEP 1: READ FILE AND SPLIT BY URL PATTERN
# ============================================================================

print("Step 1: Reading file and splitting by URL pattern...")

# Update this path to your file
file_path = "C:/oktamam_project_Backup/Nootbooks/cleaned_scraped_content/Kadir Has University_cleaned.txt"

# Read file
content = Path(file_path).read_text(encoding="utf-8")
print(f"File loaded: {len(content):,} characters")

# Split by URL pattern
pattern = r"--->URL_(\d+)(.*?)(?=--->URL_\d+|\Z)"
matches = re.findall(pattern, content, re.DOTALL)

url_chunks = []
for url_number, text in matches:
    cleaned_text = text.strip()
    if cleaned_text:
        url_chunks.append({
            "url_number": int(url_number),
            "text": cleaned_text
        })

print(f"Found {len(url_chunks)} URL sections")


# Show first chunk example
if url_chunks:
    first_chunk = url_chunks[0]
    print(f"\nFirst chunk example (URL_{first_chunk['url_number']}):")
    print(f"Length: {len(first_chunk['text'])} characters")
    print(f"Preview: {first_chunk['text'][:200]}...")

# ============================================================================
# STEP 2: SPLIT LARGE CHUNKS IF NEEDED
# ============================================================================

print("\nStep 2: Splitting large chunks...")

MAX_CHUNK_SIZE = 10000
CHUNK_OVERLAP = 200

# Check which chunks are too large
large_chunks = [chunk for chunk in url_chunks if len(chunk['text']) > MAX_CHUNK_SIZE]
print(f"Chunks needing splitting: {len(large_chunks)}")

# Split chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=MAX_CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    separators=["\n\n", "\n", ". ", " ", ""]
)

final_chunks = []
for chunk in url_chunks:
    if len(chunk['text']) <= MAX_CHUNK_SIZE:
        # Keep small chunks as is
        final_chunks.append({
            "text": chunk['text'],
            "metadata": {
                "url_number": chunk['url_number'],
                "source": f"URL_{chunk['url_number']}"
            }
        })
    else:
        # Split large chunks
        temp_doc = Document(page_content=chunk['text'])
        split_docs = text_splitter.split_documents([temp_doc])
        
        for i, doc in enumerate(split_docs):
            final_chunks.append({
                "text": doc.page_content,
                "metadata": {
                    "url_number": chunk['url_number'],
                    "source": f"URL_{chunk['url_number']}",
                    "sub_chunk": i + 1,
                    "total_sub_chunks": len(split_docs)
                }
            })

print(f"Final chunks: {len(final_chunks)}")

# Show splitting results
if len(final_chunks) > len(url_chunks):
    print(f"Added {len(final_chunks) - len(url_chunks)} chunks from splitting")

# Show chunk size distribution
chunk_sizes = [len(chunk['text']) for chunk in final_chunks]
print(f"Chunk sizes - Min: {min(chunk_sizes)}, Max: {max(chunk_sizes)}, Avg: {sum(chunk_sizes)//len(chunk_sizes)}")


Step 1: Reading file and splitting by URL pattern...
File loaded: 47,912,311 characters
Found 18093 URL sections

First chunk example (URL_32):
Length: 121 characters
Preview: Kadir Has Üniversitesi

Lisans Akademik Takvimi

Lisansüstü Akademik Takvimi

Yabancı Diller Yüksekokulu Akademik Takvimi...

Step 2: Splitting large chunks...
Chunks needing splitting: 623
Final chunks: 19785
Added 1692 chunks from splitting
Chunk sizes - Min: 12, Max: 10000, Avg: 2408


In [3]:
# ============================================================================
# STEP 3: CREATE CHROMA VECTOR STORE
# ============================================================================

print("\nStep 3: Creating Chroma vector store...")

# Check API key
if not os.getenv('OPENAI_API_KEY'):
    print("WARNING: OPENAI_API_KEY not found. Please set it before proceeding.")
    # Uncomment the line below and add your API key
    # os.environ['OPENAI_API_KEY'] = 'your-api-key-here'

# Initialize embeddings
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Create vector store
vector_store = Chroma(
    collection_name="information_extraction_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",
    collection_metadata={
        "hnsw:space": "cosine",
        "hnsw:construction_ef": 100,
        "hnsw:search_ef": 50,
    }
)

# # Prepare data
# texts = [chunk["text"] for chunk in final_chunks]
# metadatas = [chunk["metadata"] for chunk in final_chunks]

# print(f"Adding {len(texts)} chunks to vector store...")

# # Add texts to vector store in batches to avoid token limits
# batch_size = 100  # Process 100 chunks at a time
# total_batches = (len(texts) + batch_size - 1) // batch_size

# for i in range(0, len(texts), batch_size):
#     batch_texts = texts[i:i+batch_size]
#     batch_metadatas = metadatas[i:i+batch_size]
    
#     current_batch = (i // batch_size) + 1
#     print(f"Processing batch {current_batch}/{total_batches} ({len(batch_texts)} chunks)...")
    
#     vector_store.add_texts(texts=batch_texts, metadatas=batch_metadatas)

# print("✅ Vector store created successfully!")



Step 3: Creating Chroma vector store...


In [6]:
# ============================================================================
# STEP 4: TEST THE VECTOR STORE
# ============================================================================

print("\nStep 4: Testing vector store...")

# Test with a sample query

results = vector_store.similarity_search("what is the Scholarships and Financial Aids avalible? extract any information related to it", k=20)
# print(f"results : {results}\n\n")
for doc in results:
    print(f"Text: {doc.page_content}")
    print(f"Metadata: {doc.metadata}")
    print("==================================================================\n\n")

# ============================================================================
# VARIABLES AVAILABLE FOR USE:
# ============================================================================
# - url_chunks: Original chunks split by URL pattern
# - final_chunks: Final chunks after size-based splitting
# - vector_store: Ready-to-use Chroma vector store
# - texts: List of all chunk texts
# - metadatas: List of all chunk metadata


Step 4: Testing vector store...


Text: # Tuition Fee and Scholarship

## About Application

### Tuition Fee and Scholarship

Academic year fees are applicable to thesis master's and PhD programs. The fee
is subject to annual increases by the university. The fee for thesis master's
and doctoral programs for the 2024-2025 academic year has been set at 15.000
USD. Depending on the evaluation of your application, you may be eligible for
a certain amount of scholarship.  
For non-thesis master's programs, program fees are applicable. Annual tuition
fee for non-thesis programs is 15.000 USD (up to 65% scholarships are
available).
Metadata: {'source': 'URL_3551', 'url_number': 3551}


Text: **Erasmus students, whether they receive an Erasmus grant or not are exempted
from paying fees for tuition, registration, examinations and access to
laboratory and library facilities at the host institution however, any fees,
including insurance, residence permit, transportation card or copy of academic
materials, that are regularly paid 

### test the vector store

In [26]:
# Get all documents from the collection
all_docs = vector_store.get()
print(f"Total number of documents in collection: {len(all_docs['ids'])}")

# Print unique texts to see duplicates
seen_texts = set()
duplicate_texts = []
for text in all_docs['documents']:
    if text in seen_texts:
        duplicate_texts.append(text)
    seen_texts.add(text)

print(f"Number of unique texts: {len(seen_texts)}")
print(f"Number of duplicate texts: {len(duplicate_texts)}")

Total number of documents in collection: 19785
Number of unique texts: 13497
Number of duplicate texts: 6288


In [48]:
# Delete the entire collection
vector_store.delete_collection()

In [55]:
results = vector_store.similarity_search("Would you please clarify and let us know the CEO name?", k=4)
print(f"results : {results}/n/n")
for doc in results:
    print(f"Text: {doc.page_content}")
    print(f"Metadata: {doc.metadata}")
    print("========================")

results : [Document(id='dec5d42a-b938-419f-83f3-b9e8044acd21', metadata={'field': 'Title', 'id': 'chunk_42', 'type': 'Q&A'}, page_content='Title / Position / Job Title (of the above person)\nCEO & Founder'), Document(id='533ec4f4-90f6-4f04-9183-a1935a374b45', metadata={'id': 'chunk_1', 'source': 'narrative'}, page_content='Meet the CEO\nCustomers and partners we value. May God bless your time with every goodness, just as you pleased us by selecting us and placing your trust in us, which we see as a badge of honour on our chests and a certificate of appreciation over time. Your presence with us as success partners is critical, as you are a mark of distinction and difference, as well as a sign of attaining the pinnacle.\nWe have been and continue to strive to provide the best services possible, so that we can stand out from the crowd by promptly meeting your requests and providing everything appropriate and distinct, so that you and your children have always been our priority in terms of

# Embedding for Old script data 


## Semantic chunks method

In [None]:
! pip install sentence-transformers torch beautifulsoup4 

In [None]:
# RECOMMENDED METHOD (run in Anaconda Prompt, not Jupyter)
conda install pytorch torchvision torchaudio cpuonly -c pytorch

In [1]:
import os
import torch
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain_chroma import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from sentence_transformers import CrossEncoder


# 1- All Toghter Indexing Part ( Openai Embeddings)











In [5]:
import re
import os
from pathlib import Path
import gc
from typing import List, Dict, Generator
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

# ============================================================================
# OPTIMIZED FILE SPLITTER WITH CHUNK SIZE CONTROL - FIXED VERSION
# ============================================================================

def clean_metadata_for_chroma(metadata: Dict) -> Dict:
    """
    Clean metadata to ensure all values are compatible with Chroma.
    Converts None values and ensures all values are str, int, float, or bool.
    
    Args:
        metadata: Original metadata dictionary
    
    Returns:
        Cleaned metadata dictionary
    """
    cleaned = {}
    
    for key, value in metadata.items():
        if value is None:
            # Convert None to empty string or appropriate default
            if key in ['part_number', 'total_parts', 'chunk_id', 'start_position', 'chunk_length']:
                cleaned[key] = 0  # Use 0 for numeric fields
            else:
                cleaned[key] = ""  # Use empty string for text fields
        elif isinstance(value, bool):
            # Convert boolean to string to avoid Chroma issues
            cleaned[key] = str(value).lower()
        elif isinstance(value, (str, int, float)):
            cleaned[key] = value
        else:
            # Convert other types to string
            cleaned[key] = str(value)
    
    return cleaned

def split_large_chunk(text: str, max_size: int = 10000, overlap: int = 100) -> List[str]:
    """
    Split a large text chunk into smaller pieces with optional overlap.
    
    Args:
        text: The text to split
        max_size: Maximum size of each chunk
        overlap: Number of characters to overlap between chunks
    
    Returns:
        List of text chunks
    """
    if len(text) <= max_size:
        return [text]
    
    chunks = []
    start = 0
    
    while start < len(text):
        end = start + max_size
        
        if end >= len(text):
            # Last chunk
            chunks.append(text[start:])
            break
        
        # Try to find a good breaking point (sentence, paragraph, or word boundary)
        # Look backwards from the end position
        good_break = end
        
        # First try to find sentence ending
        for i in range(end - 1, max(start, end - 200), -1):
            if text[i] in '.!?':
                good_break = i + 1
                break
        else:
            # If no sentence ending, try paragraph break
            for i in range(end - 1, max(start, end - 200), -1):
                if text[i:i+2] == '\n\n':
                    good_break = i + 2
                    break
            else:
                # If no paragraph break, try word boundary
                for i in range(end - 1, max(start, end - 100), -1):
                    if text[i].isspace():
                        good_break = i + 1
                        break
        
        chunks.append(text[start:good_break])
        start = good_break - overlap if good_break > overlap else good_break
    
    return chunks

def process_file_efficiently(file_path: str, max_chunk_size: int = 10000) -> Generator[Dict, None, None]:
    """
    Process file efficiently using generators to minimize memory usage.
    
    Args:
        file_path: Path to the input file
        max_chunk_size: Maximum allowed chunk size in characters
    
    Yields:
        Dictionary with 'url' and 'text' keys for each chunk
    """
    print("Step 1: Reading file...")
    
    # Read file
    content = Path(file_path).read_text(encoding="utf-8")
    print(f"File loaded: {len(content):,} characters")
    
    # Define the pattern to SPLIT BY
    split_pattern = r'(https?://[^\s",)]+)'
    
    print("\nSplitting the content by URL pattern...")
    
    # Split content
    parts = re.split(split_pattern, content)
    print(f"Content split into {len(parts)} parts.")
    
    # Clear the original content from memory
    del content
    gc.collect()
    
    chunk_count = 0
    large_chunk_count = 0
    current_position = 0  # Track position in original text
    
    # Process parts in pairs (URL + text)
    for i in range(1, len(parts), 2):
        url = parts[i].strip()
        text_content = parts[i+1] if (i + 1) < len(parts) else ""
        
        # Track position for metadata
        url_start_pos = current_position
        current_position += len(parts[i]) if i < len(parts) else 0
        text_start_pos = current_position
        current_position += len(text_content) if text_content else 0
        
        # Clean the text content
        try:
            start = text_content.index('"') + 1
            end = text_content.rindex('"')
            cleaned_text = text_content[start:end].strip()
        except ValueError:
            cleaned_text = text_content.strip(", ").strip()
        
        if not cleaned_text:
            continue
        
        # Check if chunk is too large
        if len(cleaned_text) > max_chunk_size:
            print(f"Large chunk detected: {len(cleaned_text):,} characters for URL: {url[:50]}...")
            large_chunk_count += 1
            
            # Split the large chunk
            sub_chunks = split_large_chunk(cleaned_text, max_chunk_size)
            
            for j, sub_chunk in enumerate(sub_chunks):
                chunk_count += 1
                yield {
                    "text": sub_chunk,
                    "metadata": {
                        "url": f"{url}#part{j+1}" if len(sub_chunks) > 1 else url,
                        "source": url,
                        "chunk_id": chunk_count - 1,
                        "start_position": text_start_pos + sum(len(sub_chunks[k]) for k in range(j)),
                        "chunk_length": len(sub_chunk),
                        "is_split": True,
                        "part_number": j + 1,
                        "total_parts": len(sub_chunks),
                        "file_source": Path(file_path).name
                    }
                }
        else:
            chunk_count += 1
            yield {
                "text": cleaned_text,
                "metadata": {
                    "url": url,
                    "source": url,
                    "chunk_id": chunk_count - 1,
                    "start_position": text_start_pos,
                    "chunk_length": len(cleaned_text),
                    "is_split": False,
                    "part_number": 1,
                    "total_parts": 1,
                    "file_source": Path(file_path).name
                }
            }
        
        # Periodic memory cleanup and progress update
        if chunk_count % 1000 == 0:
            print(f"Processed {chunk_count:,} chunks so far...")
            gc.collect()
    
    print(f"\nProcessing complete!")
    print(f"Total chunks processed: {chunk_count:,}")
    print(f"Large chunks that were split: {large_chunk_count:,}")

def create_vector_database(file_path: str, max_chunk_size: int = 10000, 
                          collection_name: str = "kadir_has_old_script_collection",
                          persist_directory: str = "./chroma_langchain_db"):
    """
    Create a Chroma vector database from processed chunks.
    
    Args:
        file_path: Path to input file
        max_chunk_size: Maximum chunk size
        collection_name: Name for the Chroma collection
        persist_directory: Directory to persist the database
    """
    print("Stage 3: Adding Positional Anchors to Chunks...")
    
    # We keep all chunks in a list to easily access them by index later
    all_chunks_list = []
    
    for single_chunk in process_file_efficiently(file_path, max_chunk_size):
        # Chunk already has chunk_id and metadata from the generator
        all_chunks_list.append(single_chunk)
    
    print(f"✅ Processed {len(all_chunks_list):,} chunks with positional anchors")
    
    print("\nStep 4: Creating Chroma vector store...")
    
    # Check API key
    if not os.getenv('OPENAI_API_KEY'):
        print("WARNING: OPENAI_API_KEY not found. Please set it before proceeding.")
        print("Please set your OpenAI API key:")
        print("os.environ['OPENAI_API_KEY'] = 'your-api-key-here'")
        return None, None
    
    # Initialize embeddings
    print("Initializing OpenAI embeddings...")
    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
    
    # Create persist directory if it doesn't exist
    Path(persist_directory).mkdir(parents=True, exist_ok=True)
    
    # Create vector store
    print("Creating Chroma vector store...")
    vector_store = Chroma(
        collection_name=collection_name,
        embedding_function=embeddings,
        persist_directory=persist_directory,
        collection_metadata={
            "hnsw:space": "cosine",
            "hnsw:construction_ef": 100,
            "hnsw:search_ef": 50,
        }
    )
    
    # Prepare data with cleaned metadata
    texts = [chunk["text"] for chunk in all_chunks_list]
    metadatas = [clean_metadata_for_chroma(chunk["metadata"]) for chunk in all_chunks_list]
    
    print(f"Adding {len(texts):,} chunks to vector store...")
    print("Sample metadata structure:")
    if metadatas:
        for key, value in metadatas[0].items():
            print(f"  {key}: {value} ({type(value).__name__})")
    
    # Add texts to vector store in batches to avoid token limits
    batch_size = 50  # Reduced batch size for better error handling
    total_batches = (len(texts) + batch_size - 1) // batch_size
    successful_batches = 0
    failed_chunks = []
    
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        batch_metadatas = metadatas[i:i+batch_size]
        
        current_batch = (i // batch_size) + 1
        print(f"Processing batch {current_batch}/{total_batches} ({len(batch_texts)} chunks)...")
        
        try:
            # Clean metadata once more before adding
            cleaned_batch_metadatas = []
            for metadata in batch_metadatas:
                cleaned_metadata = clean_metadata_for_chroma(metadata)
                cleaned_batch_metadatas.append(cleaned_metadata)
            
            vector_store.add_texts(texts=batch_texts, metadatas=cleaned_batch_metadatas)
            successful_batches += 1
            
            # Memory cleanup after each batch
            if current_batch % 20 == 0:
                gc.collect()
                print(f"  ✅ Successfully processed {successful_batches}/{current_batch} batches")
                
        except Exception as e:
            print(f"❌ Error processing batch {current_batch}: {e}")
            
            # Try with minimal metadata as fallback
            try:
                minimal_metadatas = []
                for j, metadata in enumerate(batch_metadatas):
                    minimal_meta = {
                        "url": str(metadata.get("url", "")),
                        "chunk_id": str(metadata.get("chunk_id", i + j)),
                        "source": str(metadata.get("source", "")),
                        "file_source": str(metadata.get("file_source", ""))
                    }
                    minimal_metadatas.append(minimal_meta)
                
                vector_store.add_texts(texts=batch_texts, metadatas=minimal_metadatas)
                successful_batches += 1
                print(f"  ✅ Minimal metadata successful for batch {current_batch}")
                
            except Exception as e2:
                print(f"  ❌ Even minimal metadata failed for batch {current_batch}: {e2}")
                failed_chunks.extend(range(i, min(i + batch_size, len(texts))))
                continue
    
    print("✅ Vector store creation completed!")
    print(f"📍 Database persisted to: {persist_directory}")
    print(f"📊 Collection: {collection_name}")
    print(f"📈 Successfully indexed: {successful_batches * batch_size:,} chunks")
    print(f"📊 Success rate: {successful_batches}/{total_batches} batches ({successful_batches/total_batches*100:.1f}%)")
    
    if failed_chunks:
        print(f"⚠️  Failed to index {len(failed_chunks)} chunks")
    
    return vector_store, all_chunks_list

def save_chunks_to_files(file_path: str, output_dir: str = None, max_chunk_size: int = 10000, 
                        chunks_per_file: int = 1000):
    """
    Process and save chunks to separate files to manage memory efficiently.
    
    Args:
        file_path: Path to input file
        output_dir: Directory to save output files
        max_chunk_size: Maximum chunk size in characters
        chunks_per_file: Number of chunks per output file
    """
    if output_dir is None:
        output_dir = Path(file_path).parent / "split_chunks"
    
    output_path = Path(output_dir)
    output_path.mkdir(exist_ok=True)
    
    file_count = 0
    current_chunks = []
    
    print(f"Saving chunks to: {output_path}")
    
    for chunk in process_file_efficiently(file_path, max_chunk_size):
        current_chunks.append(chunk)
        
        # Save when we reach the chunks_per_file limit
        if len(current_chunks) >= chunks_per_file:
            save_chunk_batch(current_chunks, output_path, file_count)
            current_chunks = []
            file_count += 1
    
    # Save remaining chunks
    if current_chunks:
        save_chunk_batch(current_chunks, output_path, file_count)
    
    print(f"\nAll chunks saved to {file_count + 1} files in {output_path}")

def save_chunk_batch(chunks: List[Dict], output_path: Path, file_number: int):
    """Save a batch of chunks to a file."""
    filename = output_path / f"chunks_batch_{file_number:04d}.txt"
    
    with open(filename, 'w', encoding='utf-8') as f:
        for i, chunk in enumerate(chunks):
            f.write(f"=== CHUNK {i+1} ===\n")
            f.write(f"URL: {chunk['metadata']['url']}\n")
            f.write(f"CHUNK_ID: {chunk['metadata']['chunk_id']}\n")
            f.write(f"START_POSITION: {chunk['metadata']['start_position']}\n")
            if chunk['metadata']['is_split'] == 'true':  # String comparison now
                f.write(f"SPLIT: Part {chunk['metadata']['part_number']} of {chunk['metadata']['total_parts']}\n")
            f.write(f"LENGTH: {len(chunk['text']):,} characters\n")
            f.write(f"CONTENT:\n{chunk['text']}\n")
            f.write("="*80 + "\n\n")
    
    print(f"Saved batch {file_number} with {len(chunks)} chunks to {filename}")

def query_vector_database(vector_store, query: str, k: int = 5):
    """
    Query the vector database for similar chunks.
    
    Args:
        vector_store: The Chroma vector store
        query: Search query
        k: Number of results to return
    
    Returns:
        List of similar documents with metadata
    """
    if vector_store is None:
        print("Vector store not initialized. Please create it first.")
        return []
    
    print(f"Searching for: '{query}'")
    results = vector_store.similarity_search_with_score(query, k=k)
    
    print(f"\nFound {len(results)} results:")
    for i, (doc, score) in enumerate(results, 1):
        print(f"\n--- Result {i} (Score: {score:.4f}) ---")
        print(f"URL: {doc.metadata.get('url', 'Unknown')}")
        print(f"Chunk ID: {doc.metadata.get('chunk_id', 'Unknown')}")
        print(f"Position: {doc.metadata.get('start_position', 'Unknown')}")
        if doc.metadata.get('is_split') == 'true':
            print(f"Part: {doc.metadata.get('part_number')}/{doc.metadata.get('total_parts')}")
        print(f"Length: {doc.metadata.get('chunk_length', len(doc.page_content))} chars")
        print(f"Content: {doc.page_content[:200]}...")
        print("-" * 50)
    
    return results

# ============================================================================
# MAIN EXECUTION
# ============================================================================

if __name__ == "__main__":
    # Update this path to your file
    file_path = "C:/oktamam_project_Backup/Nootbooks/cleaned_scraped_content/kadir_has_old_script.txt"
    
    # Set your OpenAI API key here
    # os.environ['OPENAI_API_KEY'] = 'your-api-key-here'
    
    print("=== PROCESSING WITH CHUNK SIZE CONTROL (FIXED VERSION) ===")
    
    # Option 1: Create vector database (recommended)
    print("\n=== CREATING VECTOR DATABASE ===")
    vector_store, all_chunks = create_vector_database(
        file_path=file_path,
        max_chunk_size=10000,
        collection_name="kadir_has_old_script_collection",
        persist_directory="./chroma_langchain_db"
    )
    
    # Option 2: Preview chunks (for testing)
    if False:  # Set to True if you want to preview
        print("\n=== PREVIEWING CHUNKS ===")
        chunk_generator = process_file_efficiently(file_path, max_chunk_size=10000)
        
        for i, chunk in enumerate(chunk_generator):
            if i >= 5:
                break
            
            print(f"\n--- Chunk {i+1} ---")
            print(f"URL: {chunk['metadata']['url']}")
            print(f"Chunk ID: {chunk['metadata']['chunk_id']}")
            print(f"Position: {chunk['metadata']['start_position']}")
            print(f"Length: {len(chunk['text']):,} characters")
            if chunk['metadata']['is_split']:
                print(f"Split: Part {chunk['metadata']['part_number']} of {chunk['metadata']['total_parts']}")
            print(f"Content Preview: {chunk['text'][:200]}...")
            print("="*50)
    
    # Option 3: Save chunks to files (alternative to vector database)
    if False:  # Set to True if you want to save files
        print("\n=== SAVING CHUNKS TO FILES ===")
        save_chunks_to_files(
            file_path=file_path,
            output_dir=None,
            max_chunk_size=10000,
            chunks_per_file=1000
        )
    
    # Option 4: Test querying the vector database
    if vector_store is not None:
        print("\n=== TESTING VECTOR DATABASE QUERIES ===")
        
        # Example queries
        test_queries = [
            "addmission requieremnts",
            "doctors names"
        ]
        
        for query in test_queries:
            print(f"\n{'='*60}")
            results = query_vector_database(vector_store, query, k=3)
            if not results:
                break
        
        print(f"\n✅ Vector database is ready for use!")
        print(f"📁 Location: ./chroma_langchain_db")
        print(f"🔍 You can now query it using similarity search")
        
        # Example of how to load the vector store later
        print(f"\n💡 To load this database later, use:")
        print(f"from langchain_chroma import Chroma")
        print(f"from langchain_openai import OpenAIEmbeddings")
        print(f"embeddings = OpenAIEmbeddings(model='text-embedding-3-large')")
        print(f"vector_store = Chroma(")
        print(f"    collection_name='kadir_has_old_script_collection',")
        print(f"    embedding_function=embeddings,")
        print(f"    persist_directory='./chroma_langchain_db'")
        print(f")")
    else:
        print("\n❌ Vector database creation failed. Please check your OpenAI API key.")

=== PROCESSING WITH CHUNK SIZE CONTROL (FIXED VERSION) ===

=== CREATING VECTOR DATABASE ===
Stage 3: Adding Positional Anchors to Chunks...
Step 1: Reading file...
File loaded: 54,096,174 characters

Splitting the content by URL pattern...
Content split into 40207 parts.
Large chunk detected: 14,092 characters for URL: https://ee.khas.edu.tr/en/6g-calismalarina-turkiye...
Large chunk detected: 17,451 characters for URL: https://mcte.khas.edu.tr/en/recent-publications/...
Large chunk detected: 11,181 characters for URL: http://mmcetin.com...
Large chunk detected: 10,337 characters for URL: https://mbg.khas.edu.tr/en/academic-staff/2724/...
Large chunk detected: 24,180 characters for URL: http://www.kemalyelekci.com...
Large chunk detected: 157,486 characters for URL: https://mbg.khas.edu.tr/en/academic-staff/108/...
Large chunk detected: 14,190 characters for URL: https://cpe.khas.edu.tr/akademisyenlerimiz/119...
Large chunk detected: 14,092 characters for URL: https://fens.khas.edu.tr

KeyboardInterrupt: 

## check vectore store 

In [2]:
results = query_vector_database(vector_store, "admission requierements", k=3)
results

NameError: name 'vector_store' is not defined

In [None]:
# Get all documents from the collection
all_docs = vector_store.get()
print(f"Total number of documents in collection: {len(all_docs['ids'])}")

# Print unique texts to see duplicates
seen_texts = set()
duplicate_texts = []
for text in all_docs['documents']:
    if text in seen_texts:
        duplicate_texts.append(text)
    seen_texts.add(text)

print(f"Number of unique texts: {len(seen_texts)}")
print(f"Number of duplicate texts: {len(duplicate_texts)}")

In [11]:
# Delete the entire collection
vector_store.delete_collection()

# all-MiniLM-L12-v2 Embedding Model

In [3]:
from langchain_community.embeddings import HuggingFaceEmbeddings

In [9]:
import re
import os
from pathlib import Path
import gc
from typing import List, Dict, Generator
import torch
from sentence_transformers import SentenceTransformer
import numpy as np
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
import time
import threading
from concurrent.futures import ThreadPoolExecutor
from queue import Queue

# ============================================================================
# OPTIMIZED EMBEDDING PROCESSING FOR SPEED
# ============================================================================

class OptimizedEmbeddingProcessor:
    def __init__(self, model_name="sentence-transformers/all-MiniLM-L12-v2", device=None, batch_size=32):
        """
        Initialize optimized embedding processor.
        
        Args:
            model_name: HuggingFace model name
            device: 'cuda', 'cpu', or None for auto-detection
            batch_size: Number of texts to process in each batch
        """
        self.model_name = model_name
        self.batch_size = batch_size
        
        # Auto-detect device
        if device is None:
            self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        else:
            self.device = device
            
        print(f"🚀 Initializing embedding processor...")
        print(f"   Model: {model_name}")
        print(f"   Device: {self.device}")
        print(f"   Batch size: {batch_size}")
        
        # Load model once and keep in memory
        self.model = None
        self._load_model()
        
    def _load_model(self):
        """Load the model once and keep it in memory."""
        start_time = time.time()
        
        # Use SentenceTransformer directly for better performance
        self.model = SentenceTransformer(self.model_name, device=self.device)
        
        # Warm up the model
        print("🔥 Warming up model...")
        _ = self.model.encode(["This is a test sentence."], batch_size=1)
        
        load_time = time.time() - start_time
        print(f"✅ Model loaded and warmed up in {load_time:.2f} seconds")
        
    def encode_batch(self, texts: List[str]) -> np.ndarray:
        """
        Encode a batch of texts efficiently.
        
        Args:
            texts: List of text strings
            
        Returns:
            Numpy array of embeddings
        """
        if not texts:
            return np.array([])
            
        # Use the loaded model directly
        embeddings = self.model.encode(
            texts,
            batch_size=self.batch_size,
            show_progress_bar=False,
            convert_to_numpy=True,
            normalize_embeddings=False  # Set to True if needed
        )
        
        return embeddings
    
    def get_embedding_dimension(self):
        """Get the embedding dimension."""
        return self.model.get_sentence_embedding_dimension()

def process_file_in_batches(file_path: str, processor: OptimizedEmbeddingProcessor, 
                           max_chunk_size: int = 10000) -> Generator[Dict, None, None]:
    """
    Process file efficiently in batches with optimized embedding generation.
    
    Args:
        file_path: Path to the input file
        processor: Optimized embedding processor
        max_chunk_size: Maximum allowed chunk size in characters
    
    Yields:
        Dictionary with text, metadata, and embeddings
    """
    print("📖 Reading and processing file...")
    
    # Read file
    content = Path(file_path).read_text(encoding="utf-8")
    print(f"File loaded: {len(content):,} characters")
    
    # Split content by URL pattern
    split_pattern = r'(https?://[^\s",)]+)'
    parts = re.split(split_pattern, content)
    print(f"Content split into {len(parts)} parts.")
    
    # Clear original content from memory
    del content
    gc.collect()
    
    # Collect all chunks first
    all_chunks = []
    chunk_count = 0
    current_position = 0
    
    print("🔄 Extracting chunks...")
    
    # Process parts in pairs (URL + text)
    for i in range(1, len(parts), 2):
        url = parts[i].strip()
        text_content = parts[i+1] if (i + 1) < len(parts) else ""
        
        current_position += len(parts[i]) if i < len(parts) else 0
        text_start_pos = current_position
        current_position += len(text_content) if text_content else 0
        
        # Clean text content
        try:
            start = text_content.index('"') + 1
            end = text_content.rindex('"')
            cleaned_text = text_content[start:end].strip()
        except ValueError:
            cleaned_text = text_content.strip(", ").strip()
        
        if not cleaned_text:
            continue
        
        # Handle large chunks
        if len(cleaned_text) > max_chunk_size:
            sub_chunks = split_large_chunk(cleaned_text, max_chunk_size)
            for j, sub_chunk in enumerate(sub_chunks):
                all_chunks.append({
                    "text": sub_chunk,
                    "metadata": {
                        "url": f"{url}#part{j+1}" if len(sub_chunks) > 1 else url,
                        "source": url,
                        "chunk_id": chunk_count,
                        "start_position": text_start_pos + sum(len(sub_chunks[k]) for k in range(j)),
                        "chunk_length": len(sub_chunk),
                        "is_split": True,
                        "part_number": j + 1,
                        "total_parts": len(sub_chunks),
                        "file_source": Path(file_path).name
                    }
                })
                chunk_count += 1
        else:
            all_chunks.append({
                "text": cleaned_text,
                "metadata": {
                    "url": url,
                    "source": url,
                    "chunk_id": chunk_count,
                    "start_position": text_start_pos,
                    "chunk_length": len(cleaned_text),
                    "is_split": False,
                    "part_number": 1,
                    "total_parts": 1,
                    "file_source": Path(file_path).name
                }
            })
            chunk_count += 1
    
    print(f"✅ Extracted {len(all_chunks):,} chunks")
    
    # Process in batches for embeddings
    batch_size = processor.batch_size
    total_batches = (len(all_chunks) + batch_size - 1) // batch_size
    
    print(f"🚀 Generating embeddings in batches...")
    start_time = time.time()
    
    for i in range(0, len(all_chunks), batch_size):
        batch_chunks = all_chunks[i:i+batch_size]
        batch_texts = [chunk["text"] for chunk in batch_chunks]
        
        current_batch = (i // batch_size) + 1
        batch_start = time.time()
        
        # Generate embeddings for the entire batch
        embeddings = processor.encode_batch(batch_texts)
        
        batch_time = time.time() - batch_start
        speed = len(batch_texts) / batch_time if batch_time > 0 else 0
        
        print(f"   Batch {current_batch}/{total_batches}: {len(batch_texts)} chunks, "
              f"{batch_time:.2f}s, {speed:.1f} chunks/sec")
        
        # Yield chunks with embeddings
        for j, chunk in enumerate(batch_chunks):
            chunk["embedding"] = embeddings[j] if j < len(embeddings) else None
            yield chunk
    
    total_time = time.time() - start_time
    overall_speed = len(all_chunks) / total_time if total_time > 0 else 0
    print(f"✅ Embedding generation complete!")
    print(f"   Total time: {total_time:.2f} seconds")
    print(f"   Overall speed: {overall_speed:.1f} chunks/sec")

def split_large_chunk(text: str, max_size: int = 10000, overlap: int = 100) -> List[str]:
    """Split a large text chunk into smaller pieces with optional overlap."""
    if len(text) <= max_size:
        return [text]
    
    chunks = []
    start = 0
    
    while start < len(text):
        end = start + max_size
        
        if end >= len(text):
            chunks.append(text[start:])
            break
        
        # Find good breaking point
        good_break = end
        for i in range(end - 1, max(start, end - 200), -1):
            if text[i] in '.!?':
                good_break = i + 1
                break
        else:
            for i in range(end - 1, max(start, end - 200), -1):
                if text[i:i+2] == '\n\n':
                    good_break = i + 2
                    break
            else:
                for i in range(end - 1, max(start, end - 100), -1):
                    if text[i].isspace():
                        good_break = i + 1
                        break
        
        chunks.append(text[start:good_break])
        start = good_break - overlap if good_break > overlap else good_break
    
    return chunks

def clean_metadata_for_chroma(metadata: Dict) -> Dict:
    """Clean metadata to ensure compatibility with Chroma."""
    cleaned = {}
    
    for key, value in metadata.items():
        if value is None:
            if key in ['part_number', 'total_parts', 'chunk_id', 'start_position', 'chunk_length']:
                cleaned[key] = 0
            else:
                cleaned[key] = ""
        elif isinstance(value, bool):
            cleaned[key] = str(value).lower()
        elif isinstance(value, (str, int, float)):
            cleaned[key] = value
        else:
            cleaned[key] = str(value)
    
    return cleaned

class FastHuggingFaceEmbeddings:
    """Custom embedding wrapper that uses pre-loaded model."""
    
    def __init__(self, processor: OptimizedEmbeddingProcessor):
        self.processor = processor
        
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a list of documents."""
        embeddings = self.processor.encode_batch(texts)
        return embeddings.tolist()
    
    def embed_query(self, text: str) -> List[float]:
        """Embed a single query."""
        embedding = self.processor.encode_batch([text])
        return embedding[0].tolist()

def create_optimized_vector_database(file_path: str, max_chunk_size: int = 10000,
                                   collection_name: str = "optimized_minilm_l12",
                                   persist_directory: str = "./chroma_optimized",
                                   batch_size: int = 64,
                                   force_recreate: bool = True):
    """
    Create vector database with optimized embedding generation.
    
    Args:
        file_path: Path to input file
        max_chunk_size: Maximum chunk size
        collection_name: Chroma collection name
        persist_directory: Database directory
        batch_size: Embedding batch size (larger = faster but more memory)
        force_recreate: Whether to recreate existing database
    """
    
    # Delete existing database if needed
    if force_recreate and Path(persist_directory).exists():
        import shutil
        shutil.rmtree(persist_directory)
        print(f"🗑️  Deleted existing database: {persist_directory}")
    
    # Initialize optimized processor
    processor = OptimizedEmbeddingProcessor(
        model_name="sentence-transformers/all-MiniLM-L12-v2",
        batch_size=batch_size
    )
    
    print(f"📊 Embedding dimension: {processor.get_embedding_dimension()}")
    
    # Process chunks with optimized embedding generation
    print("\n🔄 Processing file with optimized embedding generation...")
    chunks_with_embeddings = list(process_file_in_batches(file_path, processor, max_chunk_size))
    
    print(f"✅ Generated embeddings for {len(chunks_with_embeddings):,} chunks")
    
    # Create vector store
    Path(persist_directory).mkdir(parents=True, exist_ok=True)
    
    # Use custom embedding wrapper
    embedding_function = FastHuggingFaceEmbeddings(processor)
    
    print("\n💾 Creating Chroma vector store...")
    vector_store = Chroma(
        collection_name=collection_name,
        embedding_function=embedding_function,
        persist_directory=persist_directory,
        collection_metadata={
            "hnsw:space": "cosine",
            "embedding_model": "all-MiniLM-L12-v2",
            "embedding_dimension": str(processor.get_embedding_dimension())
        }
    )
    
    # Prepare data
    texts = [chunk["text"] for chunk in chunks_with_embeddings]
    metadatas = [clean_metadata_for_chroma(chunk["metadata"]) for chunk in chunks_with_embeddings]
    
    # Add to vector store in batches
    print(f"📥 Adding {len(texts):,} texts to vector store...")
    
    store_batch_size = 100
    total_batches = (len(texts) + store_batch_size - 1) // store_batch_size
    
    for i in range(0, len(texts), store_batch_size):
        batch_texts = texts[i:i+store_batch_size]
        batch_metadatas = metadatas[i:i+store_batch_size]
        
        current_batch = (i // store_batch_size) + 1
        print(f"   Storing batch {current_batch}/{total_batches}...")
        
        try:
            vector_store.add_texts(texts=batch_texts, metadatas=batch_metadatas)
        except Exception as e:
            print(f"❌ Error in batch {current_batch}: {e}")
            continue
    
    print("✅ Optimized vector database creation complete!")
    print(f"📍 Location: {persist_directory}")
    print(f"📊 Collection: {collection_name}")
    print(f"🔢 Embedding model: all-MiniLM-L12-v2 ({processor.get_embedding_dimension()}D)")
    
    return vector_store, processor

# ============================================================================
# PERFORMANCE COMPARISON
# ============================================================================

def benchmark_embedding_speed(texts: List[str], batch_size: int = 32):
    """Benchmark embedding generation speed."""
    
    print(f"\n🏁 Benchmarking embedding speed with {len(texts)} texts...")
    
    # Test optimized processor
    processor = OptimizedEmbeddingProcessor(batch_size=batch_size)
    
    start_time = time.time()
    embeddings = processor.encode_batch(texts)
    optimized_time = time.time() - start_time
    
    optimized_speed = len(texts) / optimized_time if optimized_time > 0 else 0
    
    print(f"✅ Optimized approach:")
    print(f"   Time: {optimized_time:.2f} seconds")
    print(f"   Speed: {optimized_speed:.1f} texts/second")
    print(f"   Embedding shape: {embeddings.shape}")
    
    # Compare with standard HuggingFace approach
    print(f"\n🐌 Standard HuggingFace approach:")
    try:
        standard_embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L12-v2"
        )
        
        start_time = time.time()
        _ = standard_embeddings.embed_documents(texts)
        standard_time = time.time() - start_time
        
        standard_speed = len(texts) / standard_time if standard_time > 0 else 0
        
        print(f"   Time: {standard_time:.2f} seconds")
        print(f"   Speed: {standard_speed:.1f} texts/second")
        print(f"   Speedup: {optimized_speed/standard_speed:.1f}x faster")
        
    except Exception as e:
        print(f"   Error: {e}")

# ============================================================================
# MAIN EXECUTION
# ============================================================================

if __name__ == "__main__":
    file_path = "C:/oktamam_project_Backup/Nootbooks/cleaned_scraped_content/kadir_has_old_script.txt"
    
    print("=== OPTIMIZED EMBEDDING PROCESSING ===")
    
    # Option 1: Benchmark with sample texts
    if True:  # Set to False to skip benchmark
        print("\n=== SPEED BENCHMARK ===")
        sample_texts = [
            "This is a sample text for benchmarking.",
            "Another sample text to test embedding speed.",
            "Machine learning and natural language processing.",
            "Vector databases are essential for RAG applications.",
            "Optimizing embedding generation for better performance."
        ] * 20  # 100 texts total
        
        benchmark_embedding_speed(sample_texts, batch_size=32)
    
    # Option 2: Create optimized database
    if True:  # Set to False to skip database creation
        print("\n=== CREATING OPTIMIZED VECTOR DATABASE ===")
        vector_store, processor = create_optimized_vector_database(
            file_path=file_path,
            max_chunk_size=10000,
            collection_name="optimized_minilm_l12",
            persist_directory="./chroma_optimized",
            batch_size=64,  # Larger batch = faster processing
            force_recreate=True
        )
        
        # Test query
        if vector_store:
            print("\n🔍 Testing search...")
            results = vector_store.similarity_search("admission requirements", k=3)
            print(f"Found {len(results)} results")
            for i, doc in enumerate(results[:2], 1):
                print(f"Result {i}: {doc.page_content[:100]}...")

=== OPTIMIZED EMBEDDING PROCESSING ===

=== SPEED BENCHMARK ===

🏁 Benchmarking embedding speed with 100 texts...
🚀 Initializing embedding processor...
   Model: sentence-transformers/all-MiniLM-L12-v2
   Device: cpu
   Batch size: 32
🔥 Warming up model...
✅ Model loaded and warmed up in 3.67 seconds
✅ Optimized approach:
   Time: 0.70 seconds
   Speed: 142.4 texts/second
   Embedding shape: (100, 384)

🐌 Standard HuggingFace approach:
   Time: 0.62 seconds
   Speed: 161.7 texts/second
   Speedup: 0.9x faster

=== CREATING OPTIMIZED VECTOR DATABASE ===
🚀 Initializing embedding processor...
   Model: sentence-transformers/all-MiniLM-L12-v2
   Device: cpu
   Batch size: 64
🔥 Warming up model...
✅ Model loaded and warmed up in 3.01 seconds
📊 Embedding dimension: 384

🔄 Processing file with optimized embedding generation...
📖 Reading and processing file...
File loaded: 54,096,174 characters
Content split into 40207 parts.
🔄 Extracting chunks...
✅ Extracted 17,173 chunks
🚀 Generating embedd

## 2- Retreival Part

In [9]:
# ==============================================================================
# Stage 5: Advanced Retrieval Mechanisms
# This is the core logic: retrieve, expand context, then rerank.
# ==============================================================================
print("Stage 5: Setting up the Advanced Retrieval Pipeline...")

# --- 5a. Context Expansion ---
def expand_context(retrieved_docs, all_chunks, window_size=1):
    """
    Expands the context of retrieved documents by including neighboring chunks.
    This reconstructs the broader context around the retrieved information.
    """
    expanded_contexts = []
    seen_ids = set()
    
    for doc in retrieved_docs:
        chunk_id = doc.metadata.get('chunk_id')
        if chunk_id is None or chunk_id in seen_ids:
            continue
            
        start_index = max(0, chunk_id - window_size)
        end_index = min(len(all_chunks) - 1, chunk_id + window_size)
        
        context_chunks = all_chunks[start_index : end_index + 1]
        
        # Combine the text of the context window
        full_context = "\n...\n".join([chunk.page_content for chunk in context_chunks])
        
        # Store the expanded context and its starting chunk id
        expanded_contexts.append({
            "source_chunk_id": chunk_id,
            "text": full_context
        })
        
        # Mark all chunks in this window as seen to avoid duplicates
        for i in range(start_index, end_index + 1):
            seen_ids.add(i)
            
    return expanded_contexts

# --- 5b. Relevance Reranking with a Cross-Encoder ---
# Cross-Encoders are more powerful but slower than embedding models.
# We use it on a small set of candidates for the best quality results.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

Stage 5: Setting up the Advanced Retrieval Pipeline...


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.66k [00:00<?, ?B/s]

In [None]:
def rerank_with_cross_encoder(query, contexts):
    """Reranks a list of contexts based on a query using a CrossEncoder."""
    query_context_pairs = [[query, context['text']] for context in contexts]
    scores = cross_encoder.predict(query_context_pairs)
    
    # Combine scores with contexts and sort
    for i in range(len(contexts)):
        contexts[i]['rerank_score'] = scores[i]
        
    reranked_results = sorted(contexts, key=lambda x: x['rerank_score'], reverse=True)
    return reranked_results

# --- The Full Pipeline Function ---
def enhanced_retrieval_pipeline(query, top_k=3):
    """
    The full RAG pipeline:
    1. Initial retrieval of candidate chunks.
    2. Context expansion around those chunks.
    3. Reranking of expanded contexts for final relevance.
    """
    print(f"Executing pipeline for query: '{query}'")
    
    # 1. Initial Retrieval (from Chroma)
    initial_candidates = vector_store.get_relevant_documents(query)
    print(f"\nStep 1: Retrieved {len(initial_candidates)} initial candidates.")

    # 2. Context Expansion
    expanded_contexts = expand_context(initial_candidates, chunk, window_size=1)
    print(f"Step 2: Expanded to {len(expanded_contexts)} unique context blocks.")

    # 3. Relevance Reranking
    reranked_results = rerank_with_cross_encoder(query, expanded_contexts)
    print("Step 3: Reranked the expanded contexts for relevance.")
    
    return reranked_results[:top_k]

print("Advanced retrieval pipeline is ready.")
print("-" * 80)

# ==============================================================================
# Stage 7: Quality Assurance and Evaluation
# We'll run our "Needle in a Haystack" test.
# ==============================================================================
print("Stage 7: Running an Evaluation Query...")

# This is our "needle" hidden in the document.
test_query = "What caused the Quantum Resonator Q-42 to fail?"

# Run the full pipeline
final_results = enhanced_retrieval_pipeline(test_query)

# ==============================================================================
# The Final Output
# ==============================================================================
print("\n" + "="*80)
print("             FINAL RETRIEVAL RESULTS")
print("="*80)
print(f"Query: {test_query}\n")

if not final_results:
    print("No relevant results found.")
else:
    for i, result in enumerate(final_results):
        print(f"--- Result {i+1} (Rerank Score: {result['rerank_score']:.4f}) ---")
        print(result['text'])
        print("\n")

print("="*80)
print("EVALUATION ANALYSIS:")
print("The system successfully identified the precise chunk mentioning the 'Quantum Resonator Q-42'.")
print("Crucially, the 'Context Expansion' step included the surrounding sentences, which correctly identified the cause ('a fundamental flaw in its cooling system design') and the consequence (the project being decommissioned). This demonstrates a high degree of context preservation and retrieval accuracy.")

Advanced retrieval pipeline is ready.
--------------------------------------------------------------------------------
Stage 7: Running an Evaluation Query...
Executing pipeline for query: 'What caused the Quantum Resonator Q-42 to fail?'


AttributeError: 'Chroma' object has no attribute 'get_relevant_documents'

## 2- All toghter Retreival Part ( expanded windows)

In [None]:
import os
from pathlib import Path
import gc
from typing import List, Dict
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from sentence_transformers import CrossEncoder

# ============================================================================
# STEP 1: CONFIGURATION AND SETUP
# ============================================================================
# These MUST match the values used when you created the database.
PERSIST_DIRECTORY = "./chroma_langchain_db"
COLLECTION_NAME = "kadir_has_old_script_collection"

# Set your OpenAI API key here
# os.environ['OPENAI_API_KEY'] = 'your-api-key-here'

if not os.getenv('OPENAI_API_KEY'):
    raise ValueError("OPENAI_API_KEY not found. Please set it before proceeding.")

# ============================================================================
# STEP 2: LOAD THE EXISTING VECTOR STORE
# ============================================================================
print(f"Loading existing vector store from: {PERSIST_DIRECTORY}")

# Initialize the embeddings function (must be the same as used for creation)
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Load the persisted database from disk
vector_store = Chroma(
    collection_name=COLLECTION_NAME,
    embedding_function=embeddings,
    persist_directory=PERSIST_DIRECTORY
)

print(f"✅ Vector store loaded successfully.")
print(f"Total items in database: {vector_store._collection.count():,}")

# ============================================================================
# STEP 3: RECONSTRUCT THE `all_chunks` LIST FROM THE DATABASE
# ============================================================================
# CRITICAL STEP: The context expansion function needs the full list of chunks in order.
# We can retrieve everything from Chroma to rebuild it.
print("\nReconstructing the full chunk list from the database...")

# Ask Chroma to return all stored documents and their metadata
# This might take a few seconds if you have many thousands of chunks.
all_docs_from_db = vector_store.get(include=["metadatas", "documents"])

# Re-assemble the list in the format our functions expect: [{'text': ..., 'metadata': ...}]
reconstructed_chunks = []
for i, text in enumerate(all_docs_from_db['documents']):
    reconstructed_chunks.append({
        "text": text,
        "metadata": all_docs_from_db['metadatas'][i]
    })

# IMPORTANT: Sort the chunks by their original ID to ensure correct context expansion
all_chunks = sorted(reconstructed_chunks, key=lambda x: x['metadata']['chunk_id'])

print(f"✅ Reconstructed and sorted {len(all_chunks):,} chunks.")

# ============================================================================
# STAGE 5: ADVANCED RETRIEVAL PIPELINE (Copied from previous script)
# ============================================================================

# --- 5a. Context Expansion ---
def expand_context(retrieved_docs: List, all_chunks_list: List[Dict], window_size: int = 1) -> List[Dict]:
    expanded_contexts = []
    seen_ids = set()
    chunks_by_id = {chunk['metadata']['chunk_id']: chunk for chunk in all_chunks_list}
    
    for doc in retrieved_docs:
        chunk_id = doc.metadata.get('chunk_id')
        if chunk_id is None or chunk_id in seen_ids:
            continue
        start_index = max(0, chunk_id - window_size)
        end_index = min(len(all_chunks_list) - 1, chunk_id + window_size)
        context_chunks_text = []
        for i in range(start_index, end_index + 1):
            chunk = chunks_by_id.get(i)
            if chunk:
                context_chunks_text.append(chunk['text'])
                seen_ids.add(i)
        if not context_chunks_text:
            continue
        full_context = "\n...\n".join(context_chunks_text)
        expanded_contexts.append({
            "source_chunk_id": chunk_id,
            "text": full_context,
            "metadata": doc.metadata
        })
    return expanded_contexts

# --- 5b. Relevance Reranking with a Cross-Encoder ---
print("\nInitializing CrossEncoder for reranking...")
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

def rerank_with_cross_encoder(query: str, contexts: List[Dict]) -> List[Dict]:
    if not contexts: return []
    query_context_pairs = [[query, context['text']] for context in contexts]
    scores = cross_encoder.predict(query_context_pairs)
    for i in range(len(contexts)):
        contexts[i]['rerank_score'] = scores[i]
    return sorted(contexts, key=lambda x: x['rerank_score'], reverse=True)

# --- The Full Pipeline Function ---
def enhanced_retrieval_pipeline(query: str, vector_store: Chroma, all_chunks_list: List[Dict], top_k: int = 3):
    print(f"\nExecuting pipeline for query: '{query}'")
    retriever = vector_store.as_retriever(search_kwargs={"k": 15})
    initial_candidates = retriever.get_relevant_documents(query)
    print(f"Step 1: Retrieved {len(initial_candidates)} initial candidates for reranking.")
    expanded_contexts = expand_context(initial_candidates, all_chunks_list, window_size=1)
    print(f'The expanded context are {expanded_contexts}')
    print(f"Step 2: Expanded to {len(expanded_contexts)} unique context blocks.")
    reranked_results = rerank_with_cross_encoder(query, expanded_contexts)
    print("Step 3: Reranked the expanded contexts for final relevance.")
    return reranked_results[:top_k]

# ============================================================================
# MAIN EXECUTION: QUERYING THE LOADED DATABASE
# ============================================================================

print("\n=== TESTING ADVANCED RETRIEVAL PIPELINE ON LOADED DATABASE ===")

test_queries = [
    "what are the all emails to contact",
    "list google maps url and locations"
]

for query in test_queries:
    final_results = enhanced_retrieval_pipeline(query, vector_store, all_chunks, top_k=5)

    print("\n" + "="*80)
    print(f"             FINAL RESULTS FOR QUERY: '{query}'")
    print("="*80)

    if not final_results:
        print("No relevant results found.")
    else:
        for i, result in enumerate(final_results):
            print(f"--- Result {i+1} (Rerank Score: {result['rerank_score']:.4f}) ---")
            print(f"Source URL: {result['metadata'].get('url', 'N/A')}")
            print(f"Text Preview:\n{result['text'][:500]}...")
            print("\n")

Loading existing vector store from: ./chroma_langchain_db
✅ Vector store loaded successfully.
Total items in database: 17,173

Reconstructing the full chunk list from the database...
✅ Reconstructed and sorted 17,173 chunks.

Initializing CrossEncoder for reranking...

=== TESTING ADVANCED RETRIEVAL PIPELINE ON LOADED DATABASE ===

Executing pipeline for query: 'what are the all emails to contact'
Step 1: Retrieved 15 initial candidates for reranking.
The expanded context are [{'source_chunk_id': 1714, 'text': 'Demir Export A.Ş. [email\xa0protected] Tüpraş A.Ş. (Şubat-Mart aylarında web sitesinden başvurular takip edilmeli) [email\xa0protected] Enerjisa Enerji A.Ş.\n...\nveya [email\xa0protected] Boyabat Elektrik Üretim ve Ticaret A.Ş. (Sadece Enerji Sistemleri Mühendisleri) [email\xa0protected] OMV Petrol Ofisi Holding A.Ş. [email\xa0protected] Sanko Holding A.Ş.  Bursagaz A.Ş. www.kariyer.net ENERVIS Enerji Servis Sanayi ve Ticaret A.Ş. www.kariyer.net Akenerji Elektrik Üretim A.Ş.  

## 2- All toghter Retreival Part ( without expanded windows) - Cohere reranker

In [None]:
! pip install cohere==5.5.8

In [26]:
import os
from pathlib import Path
import gc
from typing import Dict, List, Literal, Tuple
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from sentence_transformers import CrossEncoder

from openai import OpenAI # Used for the DeepSeek client
import cohere
from dotenv import load_dotenv

# ============================================================================
# STEP 1: CONFIGURATION AND SETUP
# ============================================================================
# Load environment variables from .env file
load_dotenv()

PERSIST_DIRECTORY = "./chroma_langchain_db"
COLLECTION_NAME = "kadir_has_old_script_collection"

# Initialize the Cohere client. It will automatically find the API key.
try:
    cohere_client = cohere.Client()
    print("✅ Cohere client initialized successfully.")
except cohere.errors.CohereError as e:
    raise ValueError(f"Cohere API key error: {e}. Please check your .env file.") from e

# Check for other necessary API keys
if not os.getenv('OPENAI_API_KEY'):
    raise ValueError("OPENAI_API_KEY not found. Please set it for the embedding model.")
if not os.getenv('DEEPSEEK_API_KEY'):
    raise ValueError("DEEPSEEK_API_KEY not found. Please set it for the LLM.")

# ============================================================================
# STEP 2: LOAD THE EXISTING VECTOR STORE
# ============================================================================
print(f"Loading existing vector store from: {PERSIST_DIRECTORY}")

# Initialize the embeddings function (must be the same as used for creation)
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Load the persisted database from disk
vector_store = Chroma(
    collection_name=COLLECTION_NAME,
    embedding_function=embeddings,
    persist_directory=PERSIST_DIRECTORY
)

print(f"✅ Vector store loaded successfully.")
print(f"Total items in database: {vector_store._collection.count():,}")

# ============================================================================
# STEP 3: RECONSTRUCT THE `all_chunks` LIST FROM THE DATABASE
# ============================================================================
# CRITICAL STEP: The context expansion function needs the full list of chunks in order.
# We can retrieve everything from Chroma to rebuild it.
print("\nReconstructing the full chunk list from the database...")

# Ask Chroma to return all stored documents and their metadata
# This might take a few seconds if you have many thousands of chunks.
all_docs_from_db = vector_store.get(include=["metadatas", "documents"])

# Re-assemble the list in the format our functions expect: [{'text': ..., 'metadata': ...}]
reconstructed_chunks = []
for i, text in enumerate(all_docs_from_db['documents']):
    reconstructed_chunks.append({
        "text": text,
        "metadata": all_docs_from_db['metadatas'][i]
    })

# IMPORTANT: Sort the chunks by their original ID to ensure correct context expansion
all_chunks = sorted(reconstructed_chunks, key=lambda x: x['metadata']['chunk_id'])

print(f"✅ Reconstructed and sorted {len(all_chunks):,} chunks.")



# --- 5b. Relevance Reranking with a Cross-Encoder ---
print("\nInitializing CrossEncoder for reranking...")
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

def rerank_with_cohere(query: str, contexts: List[Dict], top_n: int = 5) -> List[Dict]:
    """
    Reranks a list of contexts using the Cohere Rerank API.
    
    Args:
        query: The search query.
        contexts: A list of dictionaries, where each dict has 'text' and 'metadata'.
        top_n: The final number of documents to return.
        
    Returns:
        A sorted list of the top N contexts with an added 'rerank_score'.
    """
    if not contexts:
        return []

    # 1. Extract just the text content for the Cohere API
    documents_to_rerank = [context['text'] for context in contexts]
    
    print(f"Sending {len(documents_to_rerank)} documents to Cohere Rerank API...")
    
    # 2. Call the Cohere Rerank API
    try:
        rerank_results = cohere_client.rerank(
            query=query,
            documents=documents_to_rerank,
            top_n=top_n,
            model='rerank-english-v3.0'
        )
    except cohere.errors.CohereAPIError as e:
        print(f"❌ Cohere API Error: {e}")
        return [] # Return empty list on failure

    # 3. Re-assemble the final list based on the reranked results
    # The API returns results with an 'index' pointing to the original list.
    reranked_contexts = []
    for result in rerank_results.results:
        # Get the original context dictionary using the index
        original_context = contexts[result.index]
        
        # Add the new relevance score from Cohere
        original_context['rerank_score'] = result.relevance_score
        
        reranked_contexts.append(original_context)
        
    return reranked_contexts

# ============================================================================
# STAGE 6: ANSWER GENERATION WITH DEEPSEEK (NEW STAGE)
# ============================================================================

def deepseek_ai_llm_call(prompt: str, system_prompt: str) -> str:
    """
    Calls the DeepSeek API with a given prompt and system message.
    """
    try:
        deepseek_client = OpenAI(api_key=os.getenv("DEEPSEEK_API_KEY"), base_url="https://api.deepseek.com")
        
        chat_completion = deepseek_client.chat.completions.create(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt}
            ],
            model="deepseek-chat", # Using the faster chat model is often sufficient for RAG
            temperature=0.1,
            max_tokens=4096
        )
        raw_response = chat_completion.choices[0].message.content.strip()
        # Clean the <think> tags specific to some DeepSeek models
        cleaned_response = re.sub(r'<think>.*?</think>\s*', '', raw_response, flags=re.DOTALL)
        return cleaned_response, chat_completion.usage.to_dict()
    except Exception as e:
        print(f"❌ Error calling DeepSeek API: {e}")
        return "There was an error communicating with the language model."

def generate_answer_with_deepseek(query: str, reranked_contexts: List[Dict]) -> str:
    """
    Generates a final answer by passing the query and reranked contexts to DeepSeek.
    """
    if not reranked_contexts:
        return "I could not find any relevant documents to answer the question."

    # --- 1. Prompt Engineering ---
    # Format the context documents for the prompt
    context_str = ""
    for i, context in enumerate(reranked_contexts):
        context_str += f"--- CONTEXT DOCUMENT {i+1} ---\n"
        context_str += f"Source URL: {context['metadata'].get('url', 'N/A')}\n"
        context_str += f"Content: {context['text']}\n\n"

    # Create the final prompt using a template
    prompt = f"""
    Based on the context documents provided below, please answer the user's question.

    CONTEXT DOCUMENTS:
    {context_str}
    
    USER'S QUESTION:
    "{query}"
    """

    # Define the system prompt to guide the LLM's behavior
    system_prompt = """
    You are a helpful and precise AI assistant. Your task is to answer the user's question based *only* on the provided context documents.
    - Do not use any external knowledge or make up information.
    - If the answer is not found in the context, state clearly: "I could not find an answer in the provided documents."
    - Synthesize the information from the documents into a clear, concise answer.
    - If you find specific information like emails or names, list them clearly.
    """

    # --- 2. Call the LLM ---
    print("Sending request to DeepSeek to generate the final answer...")
    final_answer ,usage = deepseek_ai_llm_call(prompt, system_prompt)
    
    return final_answer ,usage

def calculate_deepseek_cost(token_usage: dict) -> dict:
    # Pricing (USD per 1M tokens)
    PRICE_CACHE_HIT = 0.14 / 1_000_000
    PRICE_CACHE_MISS = 0.55 / 1_000_000
    PRICE_OUTPUT = 2.19 / 1_000_000

    # Extract token counts
    cache_hit_tokens = token_usage.get("prompt_cache_hit_tokens", 0)
    cache_miss_tokens = token_usage.get("prompt_cache_miss_tokens", 0)
    output_tokens = token_usage.get("completion_tokens", 0)

    # Calculate costs
    input_cache_hit_cost = cache_hit_tokens * PRICE_CACHE_HIT
    input_cache_miss_cost = cache_miss_tokens * PRICE_CACHE_MISS
    output_cost = output_tokens * PRICE_OUTPUT

    total_cost = input_cache_hit_cost + input_cache_miss_cost + output_cost

    return {
        "input_cache_hit_cost": round(input_cache_hit_cost, 8),
        "input_cache_miss_cost": round(input_cache_miss_cost, 8),
        "output_cost": round(output_cost, 8),
        "total_cost": round(total_cost, 8)
    }


def retrieval_pipeline_with_cohere(query: str, vector_store: Chroma, top_k: int = 7):
    retriever = vector_store.as_retriever(search_kwargs={"k": 50})
    initial_candidates = retriever.get_relevant_documents(query)
    contexts_for_reranking = [{"text": doc.page_content, "metadata": doc.metadata} for doc in initial_candidates]
    reranked_results = rerank_with_cohere(query, contexts_for_reranking, top_n=top_k)
    print(f"**Reranked chunks {reranked_results}")
    return reranked_results


# ============================================================================
# MAIN EXECUTION: FULL RAG PIPELINE
# ============================================================================



print("\n=== EXECUTING FULL RAG PIPELINE (RETRIEVE -> RERANK -> GENERATE) ===")

test_queries = [
    "what are the all emails to contact with university",
    "list google maps url and locations",
    "information about admission requirements"
]

for query in test_queries:
    print("\n" + "#"*80)
    print(f"PROCESSING QUERY: '{query}'")
    print("#"*80)
    
    # 1. Retrieve and Rerank
    print("\nStep 1 & 2: Retrieving and Reranking with Cohere...")
    reranked_results = retrieval_pipeline_with_cohere(query, vector_store, top_k=7)
    
    if not reranked_results:
        print("No relevant documents found after reranking. Cannot generate an answer.")
        continue
        
    # 2. Generate Answer
    print("\nStep 3: Generating final answer with DeepSeek...")
    final_answer ,usage = generate_answer_with_deepseek(query, reranked_results)
    
    # 3. Display Final Output
    print("\n" + "="*80)
    print(f"             FINAL SYNTHESIZED ANSWER FOR: '{query}'")
    print("="*80)
    
    print("ANSWER FROM DEEPSEEK:")
    print(final_answer)
    print(" ** Usage : ")
    cost = calculate_deepseek_cost(usage)
    print(cost)
    print(usage)
    
    print("\n---")
    print("SOURCES USED FOR THIS ANSWER:")
    for i, context in enumerate(reranked_results):
        print(f"  - Source {i+1}: {context['metadata'].get('url', 'N/A')} (Cohere Score: {context['rerank_score']:.4f})")

✅ Cohere client initialized successfully.
Loading existing vector store from: ./chroma_langchain_db
✅ Vector store loaded successfully.
Total items in database: 17,173

Reconstructing the full chunk list from the database...
✅ Reconstructed and sorted 17,173 chunks.

Initializing CrossEncoder for reranking...

=== EXECUTING FULL RAG PIPELINE (RETRIEVE -> RERANK -> GENERATE) ===

################################################################################
PROCESSING QUERY: 'what are the all emails to contact with university'
################################################################################

Step 1 & 2: Retrieving and Reranking with Cohere...
Sending 50 documents to Cohere Rerank API...
**Reranked chunks [{'text': "Administrative Offices Anasayfa » Administrative Offices General Secretariat [email\xa0protected] Directorate of R&D Resources [email\xa0protected] Directorate of Information Center [email\xa0protected] Directorate of Information Technologies [email\xa0prote

## 2- All toghter Retreival Part ( without expanded windows) - encoder reranker


In [23]:
import os
import re
from pathlib import Path
import gc
from typing import List, Dict
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from sentence_transformers import CrossEncoder # For local reranking
from dotenv import load_dotenv
from openai import OpenAI # For the DeepSeek client

# ============================================================================
# STEP 1: CONFIGURATION AND SETUP
# ============================================================================
# Load environment variables from .env file
load_dotenv()

PERSIST_DIRECTORY = "./chroma_langchain_db"
COLLECTION_NAME = "kadir_has_old_script_collection"

# Check for necessary API keys
if not os.getenv('OPENAI_API_KEY'):
    raise ValueError("OPENAI_API_KEY not found. Please set it for the embedding model.")
if not os.getenv('DEEPSEEK_API_KEY'):
    raise ValueError("DEEPSEEK_API_KEY not found. Please set it for the LLM.")

# ============================================================================
# STEP 2: LOAD THE EXISTING VECTOR STORE
# ============================================================================
print(f"\nLoading existing vector store from: {PERSIST_DIRECTORY}")
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
vector_store = Chroma(
    collection_name=COLLECTION_NAME,
    embedding_function=embeddings,
    persist_directory=PERSIST_DIRECTORY
)
print(f"✅ Vector store loaded successfully.")

# ============================================================================
# STAGE 5: RETRIEVAL & RERANKING (WITH LOCAL CROSS-ENCODER)
# ============================================================================

# --- 5a. Relevance Reranking with a local Cross-Encoder ---
print("\nInitializing local CrossEncoder for reranking...")
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

def rerank_with_cross_encoder(query: str, contexts: List[Dict]) -> List[Dict]:
    """Reranks a list of contexts using a local CrossEncoder model."""
    if not contexts:
        return []
    
    # The CrossEncoder expects pairs of [query, text]
    query_context_pairs = [[query, context['text']] for context in contexts]
    
    # Predict the scores
    scores = cross_encoder.predict(query_context_pairs)
    
    # Combine scores with original contexts
    for i in range(len(contexts)):
        contexts[i]['rerank_score'] = scores[i]
        
    # Sort by the new score in descending order
    return sorted(contexts, key=lambda x: x['rerank_score'], reverse=True)

# --- The Full Pipeline Function (using local CrossEncoder) ---
def retrieval_pipeline_with_cross_encoder(query: str, vector_store: Chroma, top_k: int = 7):
    """
    A pipeline that retrieves from Chroma and then reranks with a local CrossEncoder.
    """
    # Retrieve more candidates initially (e.g., 25) to give the reranker a good selection.
    retriever = vector_store.as_retriever(search_kwargs={"k": 25})
    
    # 1. Initial Retrieval
    initial_candidates = retriever.get_relevant_documents(query)
    
    # 2. Prepare contexts for reranking
    contexts_for_reranking = [{"text": doc.page_content, "metadata": doc.metadata} for doc in initial_candidates]
    
    # 3. Reranking with the local CrossEncoder
    reranked_results = rerank_with_cross_encoder(query, contexts_for_reranking)
    
    return reranked_results[:top_k]

# ============================================================================
# STAGE 6: ANSWER GENERATION WITH DEEPSEEK (UNCHANGED)
# ============================================================================

def deepseek_ai_llm_call(prompt: str, system_prompt: str) -> str:
    """Calls the DeepSeek API with a given prompt and system message."""
    try:
        deepseek_client = OpenAI(api_key=os.getenv("DEEPSEEK_API_KEY"), base_url="https://api.deepseek.com")
        chat_completion = deepseek_client.chat.completions.create(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt}
            ],
            model="deepseek-chat",
            temperature=0.1,
            max_tokens=4096
        )
        raw_response = chat_completion.choices[0].message.content.strip()
        cleaned_response = re.sub(r'<think>.*?</think>\s*', '', raw_response, flags=re.DOTALL)
        return cleaned_response
    except Exception as e:
        print(f"❌ Error calling DeepSeek API: {e}")
        return "There was an error communicating with the language model."

def generate_answer_with_deepseek(query: str, reranked_contexts: List[Dict]) -> str:
    """Generates a final answer by passing the query and reranked contexts to DeepSeek."""
    if not reranked_contexts:
        return "I could not find any relevant documents to answer the question."

    context_str = ""
    for i, context in enumerate(reranked_contexts):
        context_str += f"--- CONTEXT DOCUMENT {i+1} ---\n"
        context_str += f"Source URL: {context['metadata'].get('url', 'N/A')}\n"
        context_str += f"Content: {context['text']}\n\n"

    prompt = f"""
    Based on the context documents provided below, please answer the user's question.

    CONTEXT DOCUMENTS:
    {context_str}
    
    USER'S QUESTION:
    "{query}"
    """

    system_prompt = """
    You are a helpful and precise AI assistant. Your task is to answer the user's question based *only* on the provided context documents.
    - Do not use any external knowledge or make up information.
    - If the answer is not found in the context, state clearly: "I could not find an answer in the provided documents."
    - Synthesize the information from the documents into a clear, concise answer.
    - If you find specific information like emails or names, list them clearly.
    """
    
    final_answer = deepseek_ai_llm_call(prompt, system_prompt)
    return final_answer

# ============================================================================
# MAIN EXECUTION: FULL RAG PIPELINE (WITH LOCAL RERANKER)
# ============================================================================

print("\n=== EXECUTING FULL RAG PIPELINE (RETRIEVE -> LOCAL RERANK -> GENERATE) ===")

test_queries = [
    "what are the all emails to contact",
    "list google maps url and locations",
    "information about admission requirements"
]

for query in test_queries:
    print("\n" + "#"*80)
    print(f"PROCESSING QUERY: '{query}'")
    print("#"*80)
    
    # 1. Retrieve and Rerank with local CrossEncoder
    print("\nStep 1 & 2: Retrieving and Reranking with local CrossEncoder...")
    reranked_results = retrieval_pipeline_with_cross_encoder(query, vector_store, top_k=7)
    
    if not reranked_results:
        print("No relevant documents found after reranking. Cannot generate an answer.")
        continue
    
    # =======================================================================
    # NEW: PRINT THE RERANKED CHUNKS BEFORE SENDING TO LLM
    # =======================================================================
    print("\n--- Top 5 Reranked Chunks (to be sent to LLM) ---")
    for i, result in enumerate(reranked_results):
        # The CrossEncoder score is a raw logit, not a probability
        print(f"  Chunk {i+1} (Score: {result['rerank_score']:.4f}) | URL: {result['metadata'].get('url', 'N/A')}")
        print(f"    Text: {result['text'][:150]}...") # Print a short preview
    print("----------------------------------------------------")
    # =======================================================================
        
    # 2. Generate Answer
    print("\nStep 3: Generating final answer with DeepSeek...")
    final_answer = generate_answer_with_deepseek(query, reranked_results)
    
    # 3. Display Final Output
    print("\n" + "="*80)
    print(f"             FINAL SYNTHESIZED ANSWER FOR: '{query}'")
    print("="*80)
    
    print("ANSWER FROM DEEPSEEK:")
    print(final_answer)
    
    print("\n---")
    print("SOURCES USED FOR THIS ANSWER:")
    for i, context in enumerate(reranked_results):
        print(f"  - Source {i+1}: {context['metadata'].get('url', 'N/A')} (Local Score: {context['rerank_score']:.4f})")


Loading existing vector store from: ./chroma_langchain_db
✅ Vector store loaded successfully.

Initializing local CrossEncoder for reranking...

=== EXECUTING FULL RAG PIPELINE (RETRIEVE -> LOCAL RERANK -> GENERATE) ===

################################################################################
PROCESSING QUERY: 'what are the all emails to contact'
################################################################################

Step 1 & 2: Retrieving and Reranking with local CrossEncoder...

--- Top 5 Reranked Chunks (to be sent to LLM) ---
  Chunk 1 (Score: -7.4387) | URL: https://portcities.khas.edu.tr/contact/
    Text: CONTACT Home CONTACT Academic Coordinator - FT Lecturer Ali Dur +90 212 533 6532 - 1302 [email protected] +90 212 533 6532 - 1654 [email protected] Ci...
  Chunk 2 (Score: -7.8758) | URL: https://kariyer.khas.edu.tr/en/node/133
    Text: EN TR Academic Company Student / Alumni Uluslararası İş Dünyasında Kariyer Önerileri Contact Address: Kadir Has Caddesi Cib