### Import Libraries

In [None]:
! pip install python-dotenv pandas

In [None]:
! pip install openpyxl

In [2]:
import json
import os
import time
import re
from typing import Dict, List
from urllib.parse import urlsplit

from dotenv import load_dotenv
from openai import OpenAI
import numpy as np
import pandas as pd

In [3]:
load_dotenv()

True

In [4]:
pd.set_option('display.max_rows', 200)

In [5]:
OPENAI_API_KEY      = os.getenv("OPENAI_API_KEY")
OPENAI_LLM_MODEL    = os.getenv("OPENAI_LLM_MODEL")

PERPLEXITY_API_KEY  = os.getenv("PERPLEXITY_API_KEY")
NOVITA_AI_API_KEY   = os.getenv("NOVITA_AI_API_KEY")
NOVITA_AI_LLM_MODEL = os.getenv("NOVITA_AI_LLM_MODEL")

In [6]:
import os
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = "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"
os.environ['LANGCHAIN_PROJECT'] = "emails"
from langsmith import utils
utils.tracing_is_enabled()

True

In [7]:
client = OpenAI(api_key=OPENAI_API_KEY)


def openai_llm_call(prompt: str,
                    openai_model_name: str = OPENAI_LLM_MODEL) -> str:
    chat_completion = client.chat.completions.create(
                                                    messages=[
                                                        {
                                                            "role": "user",
                                                            "content": prompt,
                                                        }
                                                    ],
                                                    model=openai_model_name,
                                                    temperature=0.01
                                                )
    return chat_completion.choices[0].message.content.strip(), chat_completion.usage.to_dict()

# Cost of Sonar Reasoning Pro call

In [8]:
# sonar_reasoning_pro_cost.py
PRICING = {
    "low":   {"input_per_million": 2.0, "output_per_million": 8.0, "per_request": 6 / 1000},
    "medium": {"input_per_million": 2.0, "output_per_million": 8.0, "per_request": 10 / 1000},
    "high":  {"input_per_million": 2.0, "output_per_million": 8.0, "per_request": 14 / 1000},
}

def call_sonar_reasoning_pro_cost(meta: dict, model: str = "sonar-reasoning-pro") -> float:
    """
    Return the USD cost for a single API call, given the usage-metadata dictionary.
    """
    ctx = meta.get("search_context_size", "low")
    p = PRICING[ctx]                     # pick the right row

    # per-token prices → per-token multipliers
    in_price  = p["input_per_million"]  / 1_000_000
    out_price = p["output_per_million"] / 1_000_000

    prompt  = meta.get("prompt_tokens", 0)
    complet = meta.get("completion_tokens", 0)

    token_cost = prompt * in_price + complet * out_price
    request_cost = p["per_request"]

    return round(token_cost + request_cost, 6)   # dollars

if __name__ == "__main__":
    usage = {'completion_tokens': 694,
             'prompt_tokens': 37,
             'total_tokens': 731,
             'search_context_size': 'low'}

    print(f"Call price: ${call_sonar_reasoning_pro_cost(usage):.4f}")



Call price: $0.0116


# cost for sonar pro model

In [9]:
# sonar_pro_cost.py


def calc_sonar_pro_cost(meta: dict) -> float:
    """
    Return the USD cost for one Sonar Pro API call.

    Parameters
    ----------
    meta : dict
        A usage-metadata dictionary like the one Perplexity returns, e.g.
        {
            'completion_tokens': 694,
            'prompt_tokens': 37,
            'total_tokens': 731,
            'search_context_size': 'low'
        }

    Returns
    -------
    float
        Cost in US dollars (rounded to 6 decimals).
    """
    
    PRICING = {
    "low":    {"input_per_million": 3.0, "output_per_million": 15.0,
               "per_request": 6 / 1000},
    "medium": {"input_per_million": 3.0, "output_per_million": 15.0,
               "per_request": 10 / 1000},
    "high":   {"input_per_million": 3.0, "output_per_million": 15.0,
               "per_request": 14 / 1000},
    }
    
    
    ctx = meta.get("search_context_size", "low")
    p = PRICING[ctx]

    # convert per-million rates to per-token
    input_rate  = p["input_per_million"]  / 1_000_000
    output_rate = p["output_per_million"] / 1_000_000

    prompt_tokens     = meta.get("prompt_tokens", 0)
    completion_tokens = meta.get("completion_tokens", 0)

    token_cost   = prompt_tokens * input_rate + completion_tokens * output_rate
    request_cost = p["per_request"]

    return round(token_cost + request_cost, 6)

if __name__ == "__main__":
    usage = {
        'completion_tokens': 694,
        'prompt_tokens': 37,
        'total_tokens': 731,
        'search_context_size': 'low'
    }
    print(f"Call price: ${calc_sonar_pro_cost(usage):.4f}")


Call price: $0.0165


# Perplexity Logic Alone 

#### Perpleixty protocol instead of openai

In [None]:
import requests  # Add this import

def perplexity_call(perplexity_query: str,
                    perplexity_model: str = "sonar-reasoning-pro", #sonar-pro , #sonar-reasoning-pro
                    temperature: float = 0.01) -> tuple:
    
    url = "https://api.perplexity.ai/chat/completions"
    headers = {
        "Authorization": f"Bearer {PERPLEXITY_API_KEY}",
        "Content-Type": "application/json"
    }
    
    payload = {
        "model": perplexity_model,
        "messages": [{
            "role": "system",
            "content": "You are a specialized web crawler focused on university domains"
        }, {
            "role": "user",
            "content": perplexity_query
        }],
        "temperature": temperature,
        "search_domain_filter": ["toros.edu.tr"],  # beykent.edu.t
        "return_citations": True,
        # "web_search_options": {
        #     "search_context_size": "high"  # This is the key addition
        # }
    }

    response = requests.post(url, headers=headers, json=payload)
    
    # Add error handling
    if response.status_code != 200:
        print(f"API Error: {response.status_code}")
        print(response.text)
        return "", {}, []
    
    response_json = response.json()
    
    
    # Extract required information from response
    content = response_json['choices'][0]['message']['content']
    usage = response_json['usage']
    citations = response_json.get('citations', [])
    
    return content, usage, citations



### the query for single call for test











In [39]:
perplexity_query = f""" get me all information about (all location , return all loction titles with its address)  for "medipol.edu.tr" university
only search in this domain
"""

perplexity_query = f""" get me all information about (phone numbers)  for "medipol.edu.tr" university
only search in this domain
"""
perplexity_query = f""" get me the essential information about (About Us information)  for "isikun.edu.tr" university
only search in this domain
"""

perplexity_query = f""" for "medipol.edu.tr" university
search only in domain ,summary the basic vision of the university in general. not for specific departments but for university in general.
"""

perplexity_query = f""" for "medipol.edu.tr" university
search only in domain ,summary the basic Mission of the university in general. not for specific departments but for university in general.
"""


perplexity_query = f""" get me all information about Partnerships for "medipol.edu.tr" university
only search in this domain
"""

perplexity_query = f"""what is the Awards and Honors for "medipol.edu.tr" university
only search in this domain
"""

perplexity_query = f""" for "isikun.edu.tr" university
search only in domain ,copy for me the all google maps URLS for the university.if not found return "no information found"
"""

perplexity_query = f"""List all faculties and departments at Istanbul Medipol University with their official pages, search only in domain "medipol.edu.tr"
"""

perplexity_query = f"""List all Study Languages or if there combined languages, at Istanbul Medipol University, search only in domain "medipol.edu.tr"
"""

perplexity_query = f"""get me numbers of (Total Students , Undergraduate Students , Graduate Students , Local Students , International Students , Total Alumni),
search only in domain "medipol.edu.tr"
"""

perplexity_query = f"""what is the Accreditation for "medipol.edu.tr" university, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

perplexity_query = f"""what is the Blocked Nationalities for "medipol.edu.tr" university, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

perplexity_query = f"""what is the University Motto for "medipol.edu.tr" university, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

perplexity_query = f"""what is the Times Higher Education Ranking   for "medipol.edu.tr" university, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

perplexity_query = f"""what is the Year Established for "medipol.edu.tr" university, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

perplexity_query = f"""what is the Admission Requirements for "medipol.edu.tr" university,
search only in domain "medipol.edu.tr"
"""

perplexity_query = f""" what is the Scholarships and Financial Aids , list me all scholarships and financial aids the universoty provides for medipol university, 
search only in domain "medipol.edu.tr"
"""

perplexity_query = f""" what is the Discounts provides by medipol university for the students, list me it all 
search only in domain "medipol.edu.tr"
"""

perplexity_query = f""" Who are the Notable Alumni of medipol university, list me the all of them , If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

perplexity_query = f""" Who are the Partner Institutions of medipol university, list me the all of them , If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

perplexity_query = f"""list me all information about Industry Collaborations of medipol university, list me the all of them , If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

perplexity_query = f""" Who are the Partner Institutions of medipol university, list me the all of them , If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

perplexity_query = f""" what is the Student Satisfaction Ratings and Graduate Employment Rates  for medipol university, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

perplexity_query = f""" what is the Early Admission Policy for medipol university, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

perplexity_query = f""" what is the Deferred Admission Policy for medipol university, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

perplexity_query = f""" what is the Transfer Student Policy for medipol university, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

perplexity_query = f""" what is the Application Process Steps for medipol university, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

# perplexity_query = f""" what is the Tuition Currency and Official Tuition and Discounted Tuition for medipol university,  in general for all programs only for local students
# search only in domain "medipol.edu.tr"
# """

# perplexity_query = f""" what is the Tuition Currency and Official Tuition and Discounted Tuition for medipol university,  in general for all programs only for international students
# search only in domain "medipol.edu.tr"
# """

# perplexity_query = f""" what is the deposit required for medipol university if there specific deposit for each degree and with what amount and with what currency,  in general for all programs 
# search only in domain "medipol.edu.tr"
# """

# perplexity_query = f""" what is the application fee required for medipol university if there specific application fee for each degree and with what amount and with what currency,  in general for all programs 
# search only in domain "medipol.edu.tr"
# """

perplexity_query = f""" what is the contact information retreive ( contacts names , Emails,  organization phone numbers) for medipol university, 
search only in domain "medipol.edu.tr"
"""

perplexity_query = f""" what is the social media accounts with its links for medipol university, for (Facebook, X , Youtube , Insagram , Linkdin , Tiktok , Vk)
"""

perplexity_query = f"""List all information about (Library Information), at Istanbul Medipol University, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

perplexity_query = f"""List all information about (Research Facilities), at Istanbul Medipol University, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

perplexity_query = f"""List all information about (Housing Options), at Istanbul Medipol University, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

perplexity_query = f"""List all information about (Dining Services), at Istanbul Medipol University, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

perplexity_query = f"""List all information about (Sports Facilities), at Istanbul Medipol University, If not found, return "no information found
search only in domain "medipol.edu.tr"
""" 

perplexity_query = f"""List all information about (Student Organizations), at Istanbul Medipol University, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""             

perplexity_query = f"""List all information about (Health Services), at Istanbul Medipol University, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

perplexity_query = f"""List all information about (Career Counseling Services), at Istanbul Medipol University, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

perplexity_query = f"""List all information about (Disability Services), at Istanbul Medipol University, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""         

perplexity_query = f"""List all information about (Study Abroad and Exchange Students Details), at Istanbul Medipol University, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

perplexity_query = f"""List all information about (Campus Security Services), at Istanbul Medipol University, If not found, return "no information found
search only in domain "medipol.edu.tr"
""" 

perplexity_query = f"""List all information about (Technology Resources), at Istanbul Medipol University, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""     

perplexity_query = f"""List all information about (Transportation Services), at Istanbul Medipol University, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

perplexity_query = f"""List all information about (Cultural Centers), at Istanbul Medipol University, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

perplexity_query = f"""List all information about (Recreation Facilities), at Istanbul Medipol University, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""     

perplexity_query = f"""List all information about (Childcare Services), at Istanbul Medipol University, Childcare Services mean seerivces for children of students not programs, if not fount return  " no information found"
search only in domain "medipol.edu.tr"
"""

perplexity_query = f"""List all information about (Financial Services), at Istanbul Medipol University, Financial Services mean services for students to manage their finances,not programs, if not fount return  " no information found"
search only in domain "medipol.edu.tr"
"""

perplexity_query = f"""List all information about (Religious/Spiritual Services), at Istanbul Medipol University, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""         

perplexity_query = f"""List all information about (Student Support Services), at Istanbul Medipol University, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""




## Run single query

In [11]:
import requests  # Add this import

domain = "cyberjaya.edu.my"
def perplexity_call(perplexity_query: str,
                    perplexity_model: str = "sonar-reasoning-pro", #sonar-pro , #sonar-reasoning-pro
                    temperature: float = 0.01) -> tuple:
    
    url = "https://api.perplexity.ai/chat/completions"
    headers = {
        "Authorization": f"Bearer {PERPLEXITY_API_KEY}",
        "Content-Type": "application/json"
    }
    
    payload = {
        "model": perplexity_model,
        "messages": [{
            "role": "system",
            "content": (
            "You are a specialized web crawler and information extractor. "
            "Focus exclusively on the provided domain. "
            "Answer the user's query concisely and accurately based on the website's content. "
            "Return the answer directly without any additional text before the answer or after the answer. "
            "If you don't have the ability to actively search or crawl websites, return 'no information found'. "
            "If information is not found, clearly return 'no information found'. "
            "Always return the information in English."
        )
        },
        {
            "role": "user",
            "content": perplexity_query
        }],
        "temperature": temperature,
        "search_domain_filter": [domain],  # beykent.edu.t
        "return_citations": True,
        # "web_search_options": {
        #     "search_context_size": "high"  # This is the key addition
        # }
    }

    response = requests.post(url, headers=headers, json=payload)
    
    # Add error handling
    if response.status_code != 200:
        print(f"API Error: {response.status_code}")
        print(response.text)
        return "", {}, []
    
    response_json = response.json()
    
    
    # Extract required information from response
    content = response_json['choices'][0]['message']['content']
    usage = response_json['usage']
    citations = response_json.get('citations', [])
    
    return content, usage, citations


perplexity_query = f"""for {domain} university, search only in domain, summary the basic vision of the university in general. not for specific departments but for university in general
"""


print("===== PERPLEXITY QUERY =====")
print(perplexity_query)
print("===========================")
# Call with emails_only=True to ensure we only get emails
perplexity_response, perplexity_tokens_usage, perplexity_citations = perplexity_call(
    perplexity_query=perplexity_query
)

print("===== PERPLEXITY RESPONSE =====")
print(perplexity_response)
print("===========================")

print("===== TOKEN USAGE =====")
print(perplexity_tokens_usage)

print(f"Call price: ${call_sonar_reasoning_pro_cost(perplexity_tokens_usage):.4f}")
print("===========================")

print("===== CITATIONS =====")
for citation in perplexity_citations:
    print(citation)
print("===========================")


===== PERPLEXITY QUERY =====
for cyberjaya.edu.my university, search only in domain, summary the basic vision of the university in general. not for specific departments but for university in general

===== PERPLEXITY RESPONSE =====
<think>
The user is asking for the basic vision of the University of Cyberjaya (cyberjaya.edu.my) in general, not for specific departments. I need to look through the search results to find the university's overall vision statement.

Looking at the search results:

1. Search result [1] from cyberjaya.edu.my/university/background/about mentions:
- "Our Mission. To transform societies with holistic learning and outstanding student experiences."
- "To uplift communities with equitable access to quality education."
- It talks about the university becoming full-fledged in 2019 and their brand identity.

2. Search result [2] from web.cyberjaya.edu.my/class2020-vision-mission/ states:
- Vision: "We aspire to be a distinctive institution of higher learning producing

## Main Function - save excel file

In [None]:
import requests
import os
import re
import time
import pandas as pd
import tldextract
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock
import threading

# --- Configuration ---
# IMPORTANT: Set your API key as an environment variable for security.
# In your terminal (before starting Jupyter): export PERPLEXITY_API_KEY='your_api_key_here'
# Or, for testing in a notebook (less secure), you can uncomment the line below:
# os.environ['PERPLEXITY_API_KEY'] = 'pplx-...' 

PERPLEXITY_API_KEY = os.getenv("PERPLEXITY_API_KEY")

# Direct domain input instead of Excel file
TARGET_DOMAINS = ["medipol.edu.tr"]  #"cyberjaya.edu.my","yu.edu.sa","faculdadeadventista.edu.br","iuk.ac.jp","leeds.ac.uk","ue.edu.ph","unipa.it"

# Output folder
OUTPUT_FOLDER = "universities_informations"

# Parallel processing configuration
MAX_WORKERS = 5  # Number of parallel threads (reduced for stability)
BATCH_SIZE = 10   # Number of domains to process in each batch

# Define which queries should use high context size
HIGH_CONTEXT_QUERIES = {
    'Faculties and Departments',
    'Scholarships',
    'Accreditation',
    'Locations and Addresses',
    'Admission Requirements',
    'Contact Information',
    'Application Process',
    'Study Languages',
    'Transfer Student Policy'
}

# Thread-safe lock for cost tracking and console output
cost_lock = Lock()
print_lock = Lock()

def thread_safe_print(*args, **kwargs):
    """Thread-safe print function."""
    with print_lock:
        print(*args, **kwargs)

def validate_and_clean_domains(domains: list) -> list:
    """Validate and clean the provided domain list."""
    if not domains:
        print("Error: No domains provided in TARGET_DOMAINS list")
        return []
    
    valid_domains = []
    invalid_domains = []
    duplicate_tracker = {}
    
    print("\nProcessing provided domains...")
    
    for i, domain in enumerate(domains):
        if not domain or not isinstance(domain, str):
            invalid_domains.append(f"Index {i}: Empty or non-string")
            continue
            
        domain = domain.strip().lower()
        
        # Extract domain components using tldextract for validation
        ext = tldextract.extract(domain)
        clean_domain = f"{ext.domain}.{ext.suffix}".lower()
        
        # Validate domain format
        if (clean_domain and clean_domain != "." and ext.domain and ext.suffix and 
            len(ext.domain) > 0 and len(ext.suffix) > 0 and 
            '.' in clean_domain and not clean_domain.startswith('.') and not clean_domain.endswith('.')):
            
            # Track duplicates
            if clean_domain in duplicate_tracker:
                duplicate_tracker[clean_domain].append(i + 1)
            else:
                duplicate_tracker[clean_domain] = [i + 1]
            
            valid_domains.append(clean_domain)
            if i < 5:  # Show first 5 for debugging
                print(f"  Domain {i+1}: {domain} -> {clean_domain}")
        else:
            invalid_domains.append(f"Index {i+1}: {domain}")
            if len(invalid_domains) <= 3:  # Show first 3 invalid domains
                print(f"  Invalid domain at index {i+1}: {domain}")
    
    # Remove duplicates while preserving order
    original_count = len(valid_domains)
    unique_domains = list(dict.fromkeys(valid_domains))
    duplicates_removed = original_count - len(unique_domains)
    
    # Find and print duplicate domains
    duplicates = {domain: indices for domain, indices in duplicate_tracker.items() if len(indices) > 1}
    
    print(f"\nDomain processing summary:")
    print(f"  Total domains provided: {len(domains)}")
    print(f"  Invalid domains: {len(invalid_domains)}")
    print(f"  Valid domains found: {original_count}")
    print(f"  Duplicate domains removed: {duplicates_removed}")
    print(f"  Final unique domains: {len(unique_domains)}")
    
    # Print invalid domains
    if invalid_domains:
        print(f"\n❌ INVALID DOMAINS ({len(invalid_domains)}):")
        for invalid in invalid_domains[:10]:  # Show first 10 invalid
            print(f"  {invalid}")
        if len(invalid_domains) > 10:
            print(f"  ... and {len(invalid_domains) - 10} more")
    
    # Print duplicate domains
    if duplicates:
        print(f"\n📋 DUPLICATE DOMAINS FOUND ({len(duplicates)} duplicates):")
        for domain, indices in duplicates.items():
            print(f"  🔄 {domain} (appears at indices: {', '.join(map(str, indices))})")
    else:
        print(f"\n✅ No duplicate domains found")
    
    return unique_domains

def generate_queries_for_domain(domain: str) -> list:
    """Generates a list of query dictionaries for the given domain."""
    # This function now holds the query templates.
    query_templates = [
        {'title': 'About Us', 'query': f'get me the information about (About Us information) for {domain} university, We do not need the Mission and Vision, Educational Philosophy, or Campuses and Infrastructure sections. We only need the About Us section, if found about us section copy it exactly only search in this domain'},
        {'title': 'Vision', 'query': f'for {domain} university, search only in domain, summary the basic vision of the university in general. not for specific departments but for university in general'},
        {'title': 'Mission', 'query': f'for {domain} university, search only in domain, summary the basic Mission of the university in general. not for specific departments but for university in general'},
        {'title': 'Locations and Addresses', 'query': f'get me all information about (all location, return all location titles with its address) for {domain} university, only search in this domain'},
        {'title': 'Degrees', 'query': f'get me all information about (Degrees) avalible for {domain} university like (Bachelor, Master, PhD, etc.), only search in this domain, If not found return "no information found"'},
        {'title': 'Phone Numbers', 'query': f'get me all phone numbers for {domain} university, only search in this domain'},
        {'title': 'Partnerships', 'query': f'get me all information about Partnerships for {domain} university, only search in this domain'},
        {'title': 'Awards and Honors', 'query': f'what is the Awards and Honors for {domain} university, only search in this domain'},
        {'title': 'Study Languages', 'query': f'what is the study languages in its various programs, search only in domain {domain}, for example "English, German, French, etc.'},
        {'title': 'Student Statistics', 'query': f'how many (Total Students, Undergraduate Students, Graduate Students, Local Students, International Students, Total Alumni) in this university {domain}, search for evey number of these if found search only in domain'},
        {'title': 'Accreditation', 'query': f'what is the Accreditation for {domain} university, If not found return "no information found", search only in domain'},
        {'title': 'Blocked Nationalities', 'query': f'what is the Blocked Nationalities for {domain} university, If not found return "no information found", search only in domain'},
        {'title': 'University Motto', 'query': f'what is the University Motto for {domain} university, If not found return "no information found", search only in domain'},
        {'title': 'THE Ranking', 'query': f'What is the significance of {domain} university in Times Higher Education Impact Rankings?, If not found return "no information found", search only in domain'},
        {'title': 'Year Established', 'query': f'what is the Year Established for {domain} university, If not found return "no information found", search only in domain'},
        {'title': 'Admission Requirements', 'query': f'what is the Admission Requirements for for differents Degrees in {domain} university, search only in domain'},
        {'title': 'Scholarships', 'query': f'what is the Scholarships and Financial Aids, list all scholarships and financial aids provided by {domain}, search only in domain'},
        {'title': 'Student Discounts', 'query': f'what are the Discounts provided by {domain} for students,i dont want discounts was provided in the past i need any information abount disounts provided to for exmaple: Sibling Discounts, or international stundens or any kind of discounts, search only in domain'},
        {'title': 'Notable Alumni', 'query': f'Who are the Notable Alumni of {domain}, list all, If not found return "no information found", search only in domain'},
        {'title': 'Partner Institutions', 'query': f'Who are the Partner Institutions of {domain}, list all, If not found return "no information found", search only in domain'},
        {'title': 'Industry Collaborations', 'query': f'list all information about Industry Collaborations of {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Student Satisfaction', 'query': f'what is the Student Satisfaction Ratings and Graduate Employment Rates for {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Early Admission Policy', 'query': f'what is the Early Admission Policy for {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Deferred Admission Policy', 'query': f'what is the Deferred Admission Policy for {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Transfer Student Policy', 'query': f'what is the Transfer Student Policy for {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Application Process', 'query': f'what is the Application Process Steps for {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Contact Information', 'query': f'what is the contact information (contacts names, Emails, organization phone numbers) for {domain}, search only in domain'},
        {'title': 'Social Media', 'query': f'what are the social media accounts with links for {domain} (Facebook, X, Youtube, Instagram, Linkedin, Tiktok, Vk), search only in domain'},
        {'title': 'Library Information', 'query': f'List all information about Library Information at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Research Facilities', 'query': f'List all information about Research Facilities at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Housing Options', 'query': f'List all information about Housing Options at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Dining Services', 'query': f'List all information about Dining Services at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Sports Facilities', 'query': f'List all information about Sports and ATHLETIC Facilities like soccer , basketball or any other sports at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Student Organizations', 'query': f'List all information about Student Organizations at {domain}, If not found return "no information found",return student organizations and communities are available to cater to diverse academic, social, and cultural interests? ,search only in domain'},
        {'title': 'Health Services', 'query': f'List all information about Health Services for students and staff at {domain} university, only search in this domain , search or health facilities not programs'},
        {'title': 'Career Counseling', 'query': f'List all information about Career Counseling Services at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Disability Services', 'query': f'List all information about Disability Services at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Study Abroad Programs', 'query': f'List all information about Study Abroad and Exchange Students Details at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Campus Security', 'query': f'List all information about Campus Security Services at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Technology Resources', 'query': f'List all information about Technology Resources at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Transportation Services', 'query': f'List all information about Transportation Services at {domain},dont return information about programs i need services provided to students like shuttle bus etc..., If not found return "no information found", search only in domain'},
        {'title': 'Cultural Centers', 'query': f'List all information about Cultural Centers at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Recreation Facilities', 'query': f'List all information about Recreation Facilities at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Childcare Services', 'query': f'List all information about Childcare Services :(Services provided to children such as students children, do not return any information about programs) at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Financial Services', 'query': f'List all information about Financial Services (services for students to manage finances) at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Religious Services', 'query': f'List all information about Religious/Spiritual Services at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Student Support Services', 'query': f'List all information about Student Support Services at {domain}, If not found return "no information found", search only in domain'}
    ]
    
    return [{'title': qt['title'], 'query': qt['query']} for qt in query_templates]

def clean_citation_references(text: str) -> str:
    """
    Remove citation references like [1], [2], [3], [^1^], [^ranking^], etc. from the text.
    Remove <think> block .
    """
    # Pattern to match citation references: [number] or [number][number] etc.
    cleaned_text = re.sub(r'\[\d+\](?:\[\d+\])*', '', text)
    # Pattern to match citation references: [^number^] or [^text^]
    cleaned_text = re.sub(r'\[\^[^\]]+\^\]', '', cleaned_text)
    
    cleaned_text = re.sub(r'<think>.*?</think>', '', cleaned_text, flags=re.DOTALL).strip()
    
    return cleaned_text

def check_no_information_found(response_content: str) -> str:
    """
    Check if the response contains 'no information found' phrase.
    If found, return empty string. Otherwise, return the original response.
    """
    if response_content and "no information found" in response_content.lower():
        return ""
    return response_content

def format_citations(citations: list) -> str:
    """Format citations list into a single string."""
    if not citations:
        return "No citations provided"
    
    formatted_citations = []
    for i, citation in enumerate(citations, 1):
        formatted_citations.append(f"{i}. {citation}")
    
    return "\n".join(formatted_citations)

def calc_sonar_pro_cost(meta: dict) -> float:
    """
    Return the USD cost for one Sonar Pro API call.

    Parameters
    ----------
    meta : dict
        A usage-metadata dictionary like the one Perplexity returns, e.g.
        {
            'completion_tokens': 694,
            'prompt_tokens': 37,
            'total_tokens': 731,
            'search_context_size': 'low'
        }

    Returns
    -------
    float
        Cost in US dollars (rounded to 6 decimals).
    """
    
    PRICING = {
    "low":    {"input_per_million": 3.0, "output_per_million": 15.0,
               "per_request": 6 / 1000},
    "medium": {"input_per_million": 3.0, "output_per_million": 15.0,
               "per_request": 10 / 1000},
    "high":   {"input_per_million": 3.0, "output_per_million": 15.0,
               "per_request": 14 / 1000},
    }
    
    ctx = meta.get("search_context_size", "low")
    p = PRICING[ctx]

    # convert per-million rates to per-token
    input_rate  = p["input_per_million"]  / 1_000_000
    output_rate = p["output_per_million"] / 1_000_000

    prompt_tokens     = meta.get("prompt_tokens", 0)
    completion_tokens = meta.get("completion_tokens", 0)

    token_cost   = prompt_tokens * input_rate + completion_tokens * output_rate
    request_cost = p["per_request"]

    return round(token_cost + request_cost, 6)

def perplexity_call(perplexity_query: str, domain: str, query_title: str, perplexity_model: str = "sonar-reasoning-pro", temperature: float = 0.01) -> tuple:
    """Calls the Perplexity API with a specific query and domain filter."""
    if not PERPLEXITY_API_KEY:
        thread_safe_print("API Error: PERPLEXITY_API_KEY environment variable not set.")
        return "", {}, []
    
    # Validate domain format before making API call
    if not domain or '.' not in domain or domain.startswith('.') or domain.endswith('.'):
        thread_safe_print(f"API Error: Invalid domain format: {domain}")
        return "Failed to retrieve information - invalid domain format.", {}, []
    
    url = "https://api.perplexity.ai/chat/completions"
    headers = {"Authorization": f"Bearer {PERPLEXITY_API_KEY}", "Content-Type": "application/json"}
    
    # Base payload
    payload = {
        "model": perplexity_model,
        "messages": [
            {"role": "system",
            "content": (
            "You are a specialized web crawler and information extractor. "
            "Focus exclusively on the provided domain. "
            "Answer the user's query concisely and accurately based on the website's content. "
            "Return the answer directly without any additional text before the answer or after the answer. "
            "If you don't have the ability to actively search or crawl websites, return 'no information found'. "
            "If information is not found, clearly return 'no information found'. "
            "Always return the information in English."
        )},
            {"role": "user",
            "content": perplexity_query}
        ],
        "temperature": temperature,
        "search_domain_filter": [domain],
        "return_citations": True
    }
    
    # Add web_search_options for specific queries
    if query_title in HIGH_CONTEXT_QUERIES:
        payload["web_search_options"] = {
            "search_context_size": "high"
        }
        thread_safe_print(f"  → Using HIGH context size for '{query_title}'")
    else:
        thread_safe_print(f"  → Using default context size for '{query_title}'")
    
    try:
        response = requests.post(url, headers=headers, json=payload, timeout=120)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        thread_safe_print(f"API Request Error for {domain}: {e}")
        return f"Failed to retrieve information due to API error: {e}", {}, []
    
    response_json = response.json()
    content = response_json['choices'][0]['message']['content']
    usage = response_json.get('usage', {})
    citations = response_json.get('citations', [])
    
    return content, usage, citations

def process_single_domain(domain: str, domain_index: int, total_domains: int) -> dict:
    """Process a single domain and return results."""
    thread_id = threading.current_thread().name
    thread_safe_print(f"\n[Thread {thread_id}] STARTING DOMAIN [{domain_index}/{total_domains}]: {domain}")
    
    try:
        # Generate dynamic filename for the current domain in subdirectory
        output_filename = os.path.join(OUTPUT_FOLDER, f"{domain}_data.xlsx")
        queries = generate_queries_for_domain(domain)
        
        # Initialize cost tracker and data storage for the current domain
        domain_total_cost = 0.0
        data_rows = []
        
        total_queries = len(queries)
        for i, item in enumerate(queries):
            title = item['title']
            query = item['query']
            
            thread_safe_print(f"[Thread {thread_id}] [{i+1}/{total_queries}] Processing '{title}' for {domain}...")
            
            response_content, usage, citations = perplexity_call(
                perplexity_query=query,
                domain=domain,
                query_title=title
            )
            
            if usage:
                domain_total_cost += calc_sonar_pro_cost(usage)
            
            if response_content:
                response_content = clean_citation_references(response_content)
                # Check for "no information found" and return empty string if found
                response_content = check_no_information_found(response_content)
            else:
                response_content = "Failed to retrieve information for this query."
            
            # Format citations for Excel
            formatted_citations = format_citations(citations)
            
            # Add row to data
            data_rows.append({
                'Title': title,
                'Response Content': response_content,
                'Citations': formatted_citations
            })
            
            thread_safe_print(f"  [Thread {thread_id}] Finished: '{title}'. Data collected.")
            time.sleep(1)  # Be polite to the API

        # Create DataFrame and save to Excel
        df = pd.DataFrame(data_rows)
        
        # Save to Excel with formatting (using original naming convention)
        with pd.ExcelWriter(output_filename, engine='openpyxl') as writer:
            df.to_excel(writer, sheet_name=f'{domain}_data', index=False)
            
            # Get the workbook and worksheet for formatting
            workbook = writer.book
            worksheet = writer.sheets[f'{domain}_data']
            
            # Auto-adjust column widths
            for column in worksheet.columns:
                max_length = 0
                column_letter = column[0].column_letter
                
                for cell in column:
                    try:
                        if len(str(cell.value)) > max_length:
                            max_length = len(str(cell.value))
                    except:
                        pass
                
                # Set a reasonable maximum width
                adjusted_width = min(max_length + 2, 50)
                worksheet.column_dimensions[column_letter].width = adjusted_width
            
            # Enable text wrapping for all cells
            from openpyxl.styles import Alignment
            for row in worksheet.iter_rows():
                for cell in row:
                    cell.alignment = Alignment(wrap_text=True, vertical='top')

        # Return results for this domain
        result = {
            'domain': domain,
            'domain_index': domain_index,
            'output_filename': output_filename,
            'total_rows': len(data_rows),
            'domain_cost': domain_total_cost,
            'thread_id': thread_id
        }
        
        thread_safe_print(f"[Thread {thread_id}] COMPLETED DOMAIN: {domain} | Rows: {len(data_rows)} | Cost: ${domain_total_cost:.4f}")
        return result
        
    except Exception as e:
        thread_safe_print(f"[Thread {thread_id}] ERROR processing {domain}: {e}")
        return {
            'domain': domain,
            'domain_index': domain_index,
            'error': str(e),
            'thread_id': thread_id
        }

def process_domains_in_parallel(domains_batch: list, batch_start_index: int) -> list:
    """Process a batch of domains in parallel using ThreadPoolExecutor."""
    thread_safe_print(f"\n{'='*80}")
    thread_safe_print(f"STARTING PARALLEL BATCH: {len(domains_batch)} domains")
    thread_safe_print(f"Domains: {', '.join(domains_batch[:5])}{'...' if len(domains_batch) > 5 else ''}")
    thread_safe_print(f"{'='*80}")
    
    results = []
    
    # Use ThreadPoolExecutor for parallel processing
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # Submit all tasks
        future_to_domain = {}
        for i, domain in enumerate(domains_batch):
            domain_index = batch_start_index + i + 1
            future = executor.submit(process_single_domain, domain, domain_index, len(domains_batch))
            future_to_domain[future] = domain
        
        # Collect results as they complete
        for future in as_completed(future_to_domain):
            domain = future_to_domain[future]
            try:
                result = future.result()
                results.append(result)
            except Exception as e:
                thread_safe_print(f"ERROR: Exception occurred for domain {domain}: {e}")
                results.append({
                    'domain': domain,
                    'error': str(e)
                })
    
    return results

def main():
    """Main function to orchestrate the processing of all domains."""
    if not PERPLEXITY_API_KEY:
        print("FATAL: PERPLEXITY_API_KEY is not set. Exiting.")
        return

    # Process the provided domains directly
    target_domains = TARGET_DOMAINS
    
    if not target_domains:
        print("No valid domains found in TARGET_DOMAINS list. Exiting.")
        return

    # Create output folder if it doesn't exist
    Path(OUTPUT_FOLDER).mkdir(exist_ok=True)
    print(f"Output folder '{OUTPUT_FOLDER}' created/verified")

    print(f"\nHIGH CONTEXT QUERIES: {', '.join(HIGH_CONTEXT_QUERIES)}")
    print(f"Total queries with high context: {len(HIGH_CONTEXT_QUERIES)}")
    print(f"\nStarting parallel processing with {MAX_WORKERS} threads")
    print(f"Processing {len(target_domains)} domains in batches of {BATCH_SIZE}")

    # Initialize overall cost tracker
    total_overall_cost = 0.0
    total_rows_collected = 0
    successful_domains = 0
    failed_domains = 0
    
    # Process domains in batches
    for batch_start in range(0, len(target_domains), BATCH_SIZE):
        batch_end = min(batch_start + BATCH_SIZE, len(target_domains))
        domains_batch = target_domains[batch_start:batch_end]
        
        batch_number = (batch_start // BATCH_SIZE) + 1
        total_batches = (len(target_domains) + BATCH_SIZE - 1) // BATCH_SIZE
        
        print(f"\n{'*'*80}")
        print(f"PROCESSING BATCH {batch_number}/{total_batches}")
        print(f"Domains {batch_start + 1} to {batch_end} of {len(target_domains)}")
        print(f"{'*'*80}")
        
        # Process this batch in parallel
        batch_results = process_domains_in_parallel(domains_batch, batch_start)
        
        # Aggregate results from this batch
        batch_cost = 0.0
        batch_rows = 0
        
        for result in batch_results:
            if 'error' not in result:
                batch_cost += result.get('domain_cost', 0)
                batch_rows += result.get('total_rows', 0)
                successful_domains += 1
                print(f"✓ {result['domain']}: {result['total_rows']} rows, ${result['domain_cost']:.4f}")
            else:
                failed_domains += 1
                print(f"✗ {result['domain']}: ERROR - {result.get('error', 'Unknown error')}")
        
        total_overall_cost += batch_cost
        total_rows_collected += batch_rows
        
        print(f"\nBatch {batch_number} Summary:")
        print(f"  Batch Cost: ${batch_cost:.4f}")
        print(f"  Batch Rows: {batch_rows}")
        print(f"  Running Total Cost: ${total_overall_cost:.4f}")
        print(f"  Running Total Rows: {total_rows_collected}")

    print("\n" + "="*80)
    print("ALL DOMAINS PROCESSING COMPLETED!")
    print("="*80)
    print(f"Total domains processed: {len(target_domains)}")
    print(f"Successful domains: {successful_domains}")
    print(f"Failed domains: {failed_domains}")
    print(f"Total rows collected: {total_rows_collected}")
    print(f"Total estimated cost: ${total_overall_cost:.4f}")
    print(f"All Excel files saved in: {OUTPUT_FOLDER}")
    print("="*80)

# Run the main function
if __name__ == "__main__":
    main()

Output folder 'universities_informations' created/verified

HIGH CONTEXT QUERIES: Accreditation, Admission Requirements, Study Languages, Locations and Addresses, Faculties and Departments, Scholarships, Contact Information, Application Process, Transfer Student Policy
Total queries with high context: 9

Starting parallel processing with 5 threads
Processing 7 domains in batches of 10

********************************************************************************
PROCESSING BATCH 1/1
Domains 1 to 7 of 7
********************************************************************************

STARTING PARALLEL BATCH: 7 domains
Domains: cyberjaya.edu.my, yu.edu.sa, faculdadeadventista.edu.br, iuk.ac.jp, leeds.ac.uk...

[Thread ThreadPoolExecutor-1_0] STARTING DOMAIN [1/7]: cyberjaya.edu.my
[Thread ThreadPoolExecutor-1_0] [1/47] Processing 'About Us' for cyberjaya.edu.my...
  → Using default context size for 'About Us'

[Thread ThreadPoolExecutor-1_1] STARTING DOMAIN [2/7]: yu.edu.sa
[Thread Th

## Main Function - save JSON file

In [40]:
import requests
import os
import re
import time
import json
import tldextract
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock
import threading

# --- Configuration ---
# IMPORTANT: Set your API key as an environment variable for security.
# In your terminal (before starting Jupyter): export PERPLEXITY_API_KEY='your_api_key_here'
# Or, for testing in a notebook (less secure), you can uncomment the line below:
# os.environ['PERPLEXITY_API_KEY'] = 'pplx-...' 

PERPLEXITY_API_KEY = os.getenv("PERPLEXITY_API_KEY")

# Direct domain input instead of Excel file
TARGET_DOMAINS = ["kcgu.ac.kr"]  #"faculdadeadventista.edu.br","iuk.ac.jp","leeds.ac.uk","ue.edu.ph","unipa.it","yu.edu.sa","cyberjaya.edu.my","acsa.sa.edu","akad.de","iuk.ac.jp"

# Output folder
OUTPUT_FOLDER = "universities_informations"

# Parallel processing configuration
MAX_WORKERS = 5  # Number of parallel threads (reduced for stability)
BATCH_SIZE = 10   # Number of domains to process in each batch

# Define which queries should use high context size
HIGH_CONTEXT_QUERIES = {
    'Faculties and Departments',
    'Scholarships',
    'Accreditation',
    'Locations and Addresses',
    'Admission Requirements',
    'Contact Information',
    'Application Process',
    'Study Languages',
    'Transfer Student Policy'
}

# Thread-safe lock for cost tracking and console output
cost_lock = Lock()
print_lock = Lock()

def thread_safe_print(*args, **kwargs):
    """Thread-safe print function."""
    with print_lock:
        print(*args, **kwargs)

def validate_and_clean_domains(domains: list) -> list:
    """Validate and clean the provided domain list."""
    if not domains:
        print("Error: No domains provided in TARGET_DOMAINS list")
        return []
    
    valid_domains = []
    invalid_domains = []
    duplicate_tracker = {}
    
    print("\nProcessing provided domains...")
    
    for i, domain in enumerate(domains):
        if not domain or not isinstance(domain, str):
            invalid_domains.append(f"Index {i}: Empty or non-string")
            continue
            
        domain = domain.strip().lower()
        
        # Extract domain components using tldextract for validation
        ext = tldextract.extract(domain)
        clean_domain = f"{ext.domain}.{ext.suffix}".lower()
        
        # Validate domain format
        if (clean_domain and clean_domain != "." and ext.domain and ext.suffix and 
            len(ext.domain) > 0 and len(ext.suffix) > 0 and 
            '.' in clean_domain and not clean_domain.startswith('.') and not clean_domain.endswith('.')):
            
            # Track duplicates
            if clean_domain in duplicate_tracker:
                duplicate_tracker[clean_domain].append(i + 1)
            else:
                duplicate_tracker[clean_domain] = [i + 1]
            
            valid_domains.append(clean_domain)
            if i < 5:  # Show first 5 for debugging
                print(f"  Domain {i+1}: {domain} -> {clean_domain}")
        else:
            invalid_domains.append(f"Index {i+1}: {domain}")
            if len(invalid_domains) <= 3:  # Show first 3 invalid domains
                print(f"  Invalid domain at index {i+1}: {domain}")
    
    # Remove duplicates while preserving order
    original_count = len(valid_domains)
    unique_domains = list(dict.fromkeys(valid_domains))
    duplicates_removed = original_count - len(unique_domains)
    
    # Find and print duplicate domains
    duplicates = {domain: indices for domain, indices in duplicate_tracker.items() if len(indices) > 1}
    
    print(f"\nDomain processing summary:")
    print(f"  Total domains provided: {len(domains)}")
    print(f"  Invalid domains: {len(invalid_domains)}")
    print(f"  Valid domains found: {original_count}")
    print(f"  Duplicate domains removed: {duplicates_removed}")
    print(f"  Final unique domains: {len(unique_domains)}")
    
    # Print invalid domains
    if invalid_domains:
        print(f"\n❌ INVALID DOMAINS ({len(invalid_domains)}):")
        for invalid in invalid_domains[:10]:  # Show first 10 invalid
            print(f"  {invalid}")
        if len(invalid_domains) > 10:
            print(f"  ... and {len(invalid_domains) - 10} more")
    
    # Print duplicate domains
    if duplicates:
        print(f"\n📋 DUPLICATE DOMAINS FOUND ({len(duplicates)} duplicates):")
        for domain, indices in duplicates.items():
            print(f"  🔄 {domain} (appears at indices: {', '.join(map(str, indices))})")
    else:
        print(f"\n✅ No duplicate domains found")
    
    return unique_domains

def generate_queries_for_domain(domain: str) -> list:
    """Generates a list of query dictionaries for the given domain."""
    # This function now holds the query templates.
    query_templates = [
        {'title': 'About Us', 'query': f'get me the information about (About Us information) for {domain} university, We do not need the Mission and Vision, Educational Philosophy, or Campuses and Infrastructure sections. We only need the About Us section, if found about us section copy it exactly only search in this domain'},
        {'title': 'Vision', 'query': f'for {domain} university, search only in domain, summary the basic vision of the university in general. not for specific departments but for university in general'},
        {'title': 'Mission', 'query': f'for {domain} university, search only in domain, summary the basic Mission of the university in general. not for specific departments but for university in general'},
        {'title': 'Locations and Addresses', 'query': f'get me all information about (all location, return all location titles with its address) for {domain} university, only search in this domain'},
        {'title': 'Degrees', 'query': f'get me all information about (Degrees) avalible for {domain} university like (Bachelor, Master, PhD, etc.), only search in this domain, If not found return "no information found"'},
        {'title': 'Phone Numbers', 'query': f'get me all phone numbers for {domain} university, only search in this domain'},
        # {'title': 'Partnerships', 'query': f'get me all information about Partnerships for {domain} university, only search in this domain'},
        # {'title': 'Awards and Honors', 'query': f'what is the Awards and Honors for {domain} university, only search in this domain'},
        # {'title': 'Study Languages', 'query': f'what is the study languages in its various programs, search only in domain {domain}, for example "English, German, French, etc.'},
        # {'title': 'Student Statistics', 'query': f'how many (Total Students, Undergraduate Students, Graduate Students, Local Students, International Students, Total Alumni) in this university {domain}, search for evey number of these if found search only in domain'},
        # {'title': 'Accreditation', 'query': f'what is the Accreditation for {domain} university, If not found return "no information found", search only in domain'},
        # {'title': 'Blocked Nationalities', 'query': f'what is the Blocked Nationalities for {domain} university, If not found return "no information found", search only in domain'},
        # {'title': 'University Motto', 'query': f'what is the University Motto for {domain} university, If not found return "no information found", search only in domain'},
        # {'title': 'THE Ranking', 'query': f'What is the significance of {domain} university in Times Higher Education Impact Rankings?, If not found return "no information found", search only in domain'},
        # {'title': 'Year Established', 'query': f'what is the Year Established for {domain} university, If not found return "no information found", search only in domain'},
        # {'title': 'Admission Requirements', 'query': f'what is the Admission Requirements for for differents Degrees in {domain} university, search only in domain'},
        # {'title': 'Scholarships', 'query': f'what is the Scholarships and Financial Aids, list all scholarships and financial aids provided by {domain}, search only in domain'},
        # {'title': 'Student Discounts', 'query': f'what are the Discounts provided by {domain} for students,i dont want discounts was provided in the past i need any information abount disounts provided to for exmaple: Sibling Discounts, or international stundens or any kind of discounts, search only in domain'},
        # {'title': 'Notable Alumni', 'query': f'Who are the Notable Alumni of {domain}, list all, If not found return "no information found", search only in domain'},
        # {'title': 'Partner Institutions', 'query': f'Who are the Partner Institutions of {domain}, list all, If not found return "no information found", search only in domain'},
        # {'title': 'Industry Collaborations', 'query': f'list all information about Industry Collaborations of {domain}, If not found return "no information found", search only in domain'},
        # {'title': 'Student Satisfaction', 'query': f'what is the Student Satisfaction Ratings and Graduate Employment Rates for {domain}, If not found return "no information found", search only in domain'},
        # {'title': 'Early Admission Policy', 'query': f'what is the Early Admission Policy for {domain}, If not found return "no information found", search only in domain'},
        # {'title': 'Deferred Admission Policy', 'query': f'what is the Deferred Admission Policy for {domain}, If not found return "no information found", search only in domain'},
        # {'title': 'Transfer Student Policy', 'query': f'what is the Transfer Student Policy for {domain}, If not found return "no information found", search only in domain'},
        # {'title': 'Application Process', 'query': f'what is the Application Process Steps for {domain}, If not found return "no information found", search only in domain'},
        # {'title': 'Contact Information', 'query': f'what is the contact information (contacts names, Emails, organization phone numbers) for {domain}, search only in domain'},
        # {'title': 'Social Media', 'query': f'what are the social media accounts with links for {domain} (Facebook, X, Youtube, Instagram, Linkedin, Tiktok, Vk), search only in domain'},
        # {'title': 'Library Information', 'query': f'List all information about Library Information at {domain}, If not found return "no information found", search only in domain'},
        # {'title': 'Research Facilities', 'query': f'List all information about Research Facilities at {domain}, If not found return "no information found", search only in domain'},
        # {'title': 'Housing Options', 'query': f'List all information about Housing Options at {domain}, If not found return "no information found", search only in domain'},
        # {'title': 'Dining Services', 'query': f'List all information about Dining Services at {domain}, If not found return "no information found", search only in domain'},
        # {'title': 'Sports Facilities', 'query': f'List all information about Sports and ATHLETIC Facilities like soccer , basketball or any other sports at {domain}, If not found return "no information found", search only in domain'},
        # {'title': 'Student Organizations', 'query': f'List all information about Student Organizations at {domain}, If not found return "no information found",return student organizations and communities are available to cater to diverse academic, social, and cultural interests? ,search only in domain'},
        # {'title': 'Health Services', 'query': f'List all information about Health Services for students and staff at {domain} university, only search in this domain , search or health facilities not programs'},
        # {'title': 'Career Counseling', 'query': f'List all information about Career Counseling Services at {domain}, If not found return "no information found", search only in domain'},
        # {'title': 'Disability Services', 'query': f'List all information about Disability Services at {domain}, If not found return "no information found", search only in domain'},
        # {'title': 'Study Abroad Programs', 'query': f'List all information about Study Abroad and Exchange Students Details at {domain}, If not found return "no information found", search only in domain'},
        # {'title': 'Campus Security', 'query': f'List all information about Campus Security Services at {domain}, If not found return "no information found", search only in domain'},
        # {'title': 'Technology Resources', 'query': f'List all information about Technology Resources at {domain}, If not found return "no information found", search only in domain'},
        # {'title': 'Transportation Services', 'query': f'List all information about Transportation Services at {domain},dont return information about programs i need services provided to students like shuttle bus etc..., If not found return "no information found", search only in domain'},
        # {'title': 'Cultural Centers', 'query': f'List all information about Cultural Centers at {domain}, If not found return "no information found", search only in domain'},
        # {'title': 'Recreation Facilities', 'query': f'List all information about Recreation Facilities at {domain}, If not found return "no information found", search only in domain'},
        # {'title': 'Childcare Services', 'query': f'List all information about Childcare Services :(Services provided to children such as students children, do not return any information about programs) at {domain}, If not found return "no information found", search only in domain'},
        # {'title': 'Financial Services', 'query': f'List all information about Financial Services (services for students to manage finances) at {domain}, If not found return "no information found", search only in domain'},
        # {'title': 'Religious Services', 'query': f'List all information about Religious/Spiritual Services at {domain}, If not found return "no information found", search only in domain'},
        # {'title': 'Student Support Services', 'query': f'List all information about Student Support Services at {domain}, If not found return "no information found", search only in domain'}
    ]
    
    return [{'title': qt['title'], 'query': qt['query']} for qt in query_templates]

def clean_citation_references(text: str) -> str:
    """
    Remove citation references like [1], [2], [3], [^1^], [^ranking^], etc. from the text.
    Remove <think> block .
    """
    # Pattern to match citation references: [number] or [number][number] etc.
    cleaned_text = re.sub(r'\[\d+\](?:\[\d+\])*', '', text)
    # Pattern to match citation references: [^number^] or [^text^]
    cleaned_text = re.sub(r'\[\^[^\]]+\^\]', '', cleaned_text)
    
    cleaned_text = re.sub(r'<think>.*?</think>', '', cleaned_text, flags=re.DOTALL).strip()
    
    return cleaned_text

def check_no_information_found(response_content: str) -> str:
    """
    Check if the response contains 'no information found' phrase.
    If found, return empty string. Otherwise, return the original response.
    """
    if response_content and "no information found" in response_content.lower():
        return ""
    return response_content

def calc_sonar_pro_cost(meta: dict) -> float:
    """
    Return the USD cost for one Sonar Pro API call.

    Parameters
    ----------
    meta : dict
        A usage-metadata dictionary like the one Perplexity returns, e.g.
        {
            'completion_tokens': 694,
            'prompt_tokens': 37,
            'total_tokens': 731,
            'search_context_size': 'low'
        }

    Returns
    -------
    float
        Cost in US dollars (rounded to 6 decimals).
    """
    
    PRICING = {
    "low":    {"input_per_million": 3.0, "output_per_million": 15.0,
               "per_request": 6 / 1000},
    "medium": {"input_per_million": 3.0, "output_per_million": 15.0,
               "per_request": 10 / 1000},
    "high":   {"input_per_million": 3.0, "output_per_million": 15.0,
               "per_request": 14 / 1000},
    }
    
    ctx = meta.get("search_context_size", "low")
    p = PRICING[ctx]

    # convert per-million rates to per-token
    input_rate  = p["input_per_million"]  / 1_000_000
    output_rate = p["output_per_million"] / 1_000_000

    prompt_tokens     = meta.get("prompt_tokens", 0)
    completion_tokens = meta.get("completion_tokens", 0)

    token_cost   = prompt_tokens * input_rate + completion_tokens * output_rate
    request_cost = p["per_request"]

    return round(token_cost + request_cost, 6)

def perplexity_call(perplexity_query: str, domain: str, query_title: str, perplexity_model: str = "sonar-reasoning-pro", temperature: float = 0.01) -> tuple:
    """Calls the Perplexity API with a specific query and domain filter."""
    if not PERPLEXITY_API_KEY:
        thread_safe_print("API Error: PERPLEXITY_API_KEY environment variable not set.")
        return "", {}
    
    # Validate domain format before making API call
    if not domain or '.' not in domain or domain.startswith('.') or domain.endswith('.'):
        thread_safe_print(f"API Error: Invalid domain format: {domain}")
        return "Failed to retrieve information - invalid domain format.", {}
    
    url = "https://api.perplexity.ai/chat/completions"
    headers = {"Authorization": f"Bearer {PERPLEXITY_API_KEY}", "Content-Type": "application/json"}
    
    # Base payload (removed return_citations)
    payload = {
        "model": perplexity_model,
        "messages": [
            {"role": "system",
            "content": (
            "You are a specialized web crawler and information extractor. "
            "Focus exclusively on the provided domain. "
            "Answer the user's query concisely and accurately based on the website's content. "
            "Return the answer directly without any additional text before the answer or after the answer. "
            "If you don't have the ability to actively search or crawl websites, return 'no information found'. "
            "If information is not found, clearly return 'no information found'. "
            "Always return the information in English."
        )},
            {"role": "user",
            "content": perplexity_query}
        ],
        "temperature": temperature,
        "search_domain_filter": [domain]
    }
    
    # Add web_search_options for specific queries
    if query_title in HIGH_CONTEXT_QUERIES:
        payload["web_search_options"] = {
            "search_context_size": "high"
        }
        thread_safe_print(f"  → Using HIGH context size for '{query_title}'")
    else:
        thread_safe_print(f"  → Using default context size for '{query_title}'")
    
    try:
        response = requests.post(url, headers=headers, json=payload, timeout=120)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        thread_safe_print(f"API Request Error for {domain}: {e}")
        return f"Failed to retrieve information due to API error: {e}", {}
    
    response_json = response.json()
    content = response_json['choices'][0]['message']['content']
    usage = response_json.get('usage', {})
    
    return content, usage

def process_single_domain(domain: str, domain_index: int, total_domains: int) -> dict:
    """Process a single domain and return results."""
    thread_id = threading.current_thread().name
    thread_safe_print(f"\n[Thread {thread_id}] STARTING DOMAIN [{domain_index}/{total_domains}]: {domain}")
    
    try:
        # Generate dynamic filename for the current domain in subdirectory
        output_filename = os.path.join(OUTPUT_FOLDER, f"{domain}_data.json")
        queries = generate_queries_for_domain(domain)
        
        # Initialize cost tracker and data storage for the current domain
        domain_total_cost = 0.0
        data_dict = {}
        
        total_queries = len(queries)
        for i, item in enumerate(queries):
            title = item['title']
            query = item['query']
            
            thread_safe_print(f"[Thread {thread_id}] [{i+1}/{total_queries}] Processing '{title}' for {domain}...")
            
            response_content, usage = perplexity_call(
                perplexity_query=query,
                domain=domain,
                query_title=title
            )
            
            if usage:
                domain_total_cost += calc_sonar_pro_cost(usage)
            
            if response_content:
                response_content = clean_citation_references(response_content)
                # Check for "no information found" and return empty string if found
                response_content = check_no_information_found(response_content)
            else:
                response_content = "Failed to retrieve information for this query."
            
            # Add to data dictionary with title as key and response as value
            data_dict[title] = response_content
            
            thread_safe_print(f"  [Thread {thread_id}] Finished: '{title}'. Data collected.")
            time.sleep(1)  # Be polite to the API

        # Save to JSON file
        with open(output_filename, 'w', encoding='utf-8') as json_file:
            json.dump(data_dict, json_file, indent=2, ensure_ascii=False)

        # Return results for this domain
        result = {
            'domain': domain,
            'domain_index': domain_index,
            'output_filename': output_filename,
            'total_entries': len(data_dict),
            'domain_cost': domain_total_cost,
            'thread_id': thread_id
        }
        
        thread_safe_print(f"[Thread {thread_id}] COMPLETED DOMAIN: {domain} | Entries: {len(data_dict)} | Cost: ${domain_total_cost:.4f}")
        return result
        
    except Exception as e:
        thread_safe_print(f"[Thread {thread_id}] ERROR processing {domain}: {e}")
        return {
            'domain': domain,
            'domain_index': domain_index,
            'error': str(e),
            'thread_id': thread_id
        }

def process_domains_in_parallel(domains_batch: list, batch_start_index: int) -> list:
    """Process a batch of domains in parallel using ThreadPoolExecutor."""
    thread_safe_print(f"\n{'='*80}")
    thread_safe_print(f"STARTING PARALLEL BATCH: {len(domains_batch)} domains")
    thread_safe_print(f"Domains: {', '.join(domains_batch[:5])}{'...' if len(domains_batch) > 5 else ''}")
    thread_safe_print(f"{'='*80}")
    
    results = []
    
    # Use ThreadPoolExecutor for parallel processing
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # Submit all tasks
        future_to_domain = {}
        for i, domain in enumerate(domains_batch):
            domain_index = batch_start_index + i + 1
            future = executor.submit(process_single_domain, domain, domain_index, len(domains_batch))
            future_to_domain[future] = domain
        
        # Collect results as they complete
        for future in as_completed(future_to_domain):
            domain = future_to_domain[future]
            try:
                result = future.result()
                results.append(result)
            except Exception as e:
                thread_safe_print(f"ERROR: Exception occurred for domain {domain}: {e}")
                results.append({
                    'domain': domain,
                    'error': str(e)
                })
    
    return results

def main():
    """Main function to orchestrate the processing of all domains."""
    if not PERPLEXITY_API_KEY:
        print("FATAL: PERPLEXITY_API_KEY is not set. Exiting.")
        return

    # Process the provided domains directly
    target_domains = TARGET_DOMAINS
    
    if not target_domains:
        print("No valid domains found in TARGET_DOMAINS list. Exiting.")
        return

    # Create output folder if it doesn't exist
    Path(OUTPUT_FOLDER).mkdir(exist_ok=True)
    print(f"Output folder '{OUTPUT_FOLDER}' created/verified")

    print(f"\nHIGH CONTEXT QUERIES: {', '.join(HIGH_CONTEXT_QUERIES)}")
    print(f"Total queries with high context: {len(HIGH_CONTEXT_QUERIES)}")
    print(f"\nStarting parallel processing with {MAX_WORKERS} threads")
    print(f"Processing {len(target_domains)} domains in batches of {BATCH_SIZE}")

    # Initialize overall cost tracker
    total_overall_cost = 0.0
    total_entries_collected = 0
    successful_domains = 0
    failed_domains = 0
    
    # Process domains in batches
    for batch_start in range(0, len(target_domains), BATCH_SIZE):
        batch_end = min(batch_start + BATCH_SIZE, len(target_domains))
        domains_batch = target_domains[batch_start:batch_end]
        
        batch_number = (batch_start // BATCH_SIZE) + 1
        total_batches = (len(target_domains) + BATCH_SIZE - 1) // BATCH_SIZE
        
        print(f"\n{'*'*80}")
        print(f"PROCESSING BATCH {batch_number}/{total_batches}")
        print(f"Domains {batch_start + 1} to {batch_end} of {len(target_domains)}")
        print(f"{'*'*80}")
        
        # Process this batch in parallel
        batch_results = process_domains_in_parallel(domains_batch, batch_start)
        
        # Aggregate results from this batch
        batch_cost = 0.0
        batch_entries = 0
        
        for result in batch_results:
            if 'error' not in result:
                batch_cost += result.get('domain_cost', 0)
                batch_entries += result.get('total_entries', 0)
                successful_domains += 1
                print(f"✓ {result['domain']}: {result['total_entries']} entries, ${result['domain_cost']:.4f}")
            else:
                failed_domains += 1
                print(f"✗ {result['domain']}: ERROR - {result.get('error', 'Unknown error')}")
        
        total_overall_cost += batch_cost
        total_entries_collected += batch_entries
        
        print(f"\nBatch {batch_number} Summary:")
        print(f"  Batch Cost: ${batch_cost:.4f}")
        print(f"  Batch Entries: {batch_entries}")
        print(f"  Running Total Cost: ${total_overall_cost:.4f}")
        print(f"  Running Total Entries: {total_entries_collected}")

    print("\n" + "="*80)
    print("ALL DOMAINS PROCESSING COMPLETED!")
    print("="*80)
    print(f"Total domains processed: {len(target_domains)}")
    print(f"Successful domains: {successful_domains}")
    print(f"Failed domains: {failed_domains}")
    print(f"Total entries collected: {total_entries_collected}")
    print(f"Total estimated cost: ${total_overall_cost:.4f}")
    print(f"All JSON files saved in: {OUTPUT_FOLDER}")
    print("="*80)

# Run the main function
if __name__ == "__main__":
    main()

Output folder 'universities_informations' created/verified

HIGH CONTEXT QUERIES: Contact Information, Application Process, Accreditation, Study Languages, Scholarships, Admission Requirements, Faculties and Departments, Locations and Addresses, Transfer Student Policy
Total queries with high context: 9

Starting parallel processing with 5 threads
Processing 1 domains in batches of 10

********************************************************************************
PROCESSING BATCH 1/1
Domains 1 to 1 of 1
********************************************************************************

STARTING PARALLEL BATCH: 1 domains
Domains: kcgu.ac.kr

[Thread ThreadPoolExecutor-0_0] STARTING DOMAIN [1/1]: kcgu.ac.kr
[Thread ThreadPoolExecutor-0_0] [1/6] Processing 'About Us' for kcgu.ac.kr...
  → Using default context size for 'About Us'
  [Thread ThreadPoolExecutor-0_0] Finished: 'About Us'. Data collected.
[Thread ThreadPoolExecutor-0_0] [2/6] Processing 'Vision' for kcgu.ac.kr...
  → Using def

## Main Function - save excel file - web_search_options paramter=high - Threadpool and read from excell file


In [None]:
import requests
import os
import re
import time
import pandas as pd
import tldextract
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock
import threading

# --- Configuration ---
# IMPORTANT: Set your API key as an environment variable for security.
# In your terminal (before starting Jupyter): export PERPLEXITY_API_KEY='your_api_key_here'
# Or, for testing in a notebook (less secure), you can uncomment the line below:
# os.environ['PERPLEXITY_API_KEY'] = 'pplx-...' 

PERPLEXITY_API_KEY = os.getenv("PERPLEXITY_API_KEY")

# Excel file name and output folder
EXCEL_FILE = "250_Uni_Links.xlsx"  # Change this to your Excel file name
OUTPUT_FOLDER = "250 universities informations"

# Parallel processing configuration
MAX_WORKERS = 5  # Number of parallel threads (reduced for stability)
BATCH_SIZE = 10   # Number of domains to process in each batch

# Define which queries should use high context size
HIGH_CONTEXT_QUERIES = {
    'Faculties and Departments',
    'Scholarships',
    'Accreditation',
    'Locations and Addresses',
    'Admission Requirements',
    'Contact Information',
    'Application Process',
    'Study Languages',
    'Transfer Student Policy'
}

# Thread-safe lock for cost tracking and console output
cost_lock = Lock()
print_lock = Lock()

def thread_safe_print(*args, **kwargs):
    """Thread-safe print function."""
    with print_lock:
        print(*args, **kwargs)

def extract_domains_from_excel(excel_file: str) -> list:
    """Extract domains from the Link column in Excel file."""
    try:
        # Read the Excel file
        df = pd.read_excel(excel_file)
        
        # Clean column names by stripping whitespace
        df.columns = df.columns.str.strip()
        
        # Check if 'Link' column exists
        if 'Link' not in df.columns:
            print("Error: 'Link' column not found in Excel file")
            print(f"Available columns: {list(df.columns)}")
            return []
        
        domains = []
        duplicate_tracker = {}  # Track duplicates with their row numbers
        empty_links = 0
        invalid_domains = 0
        error_links = 0
        
        print("\nProcessing domains from Excel file...")
        for index, row in df.iterrows():
            link = row['Link']
            if pd.notna(link) and str(link).strip():  # Check if link is not NaN and not empty
                try:
                    link_str = str(link).strip()
                    # Extract domain components using tldextract
                    ext = tldextract.extract(link_str)
                    # Get the registered domain (without subdomain) - LOWERCASE like example script
                    domain = f"{ext.domain}.{ext.suffix}".lower()
                    
                    # Validate domain more strictly
                    if (domain and domain != "." and ext.domain and ext.suffix and 
                        len(ext.domain) > 0 and len(ext.suffix) > 0 and 
                        '.' in domain and not domain.startswith('.') and not domain.endswith('.')):
                        
                        # Track duplicates
                        if domain in duplicate_tracker:
                            duplicate_tracker[domain].append(index + 1)
                        else:
                            duplicate_tracker[domain] = [index + 1]
                        
                        domains.append(domain)
                        if index < 5:  # Show first 5 for debugging
                            print(f"  Row {index+1}: {link_str} -> {domain}")
                    else:
                        invalid_domains += 1
                        if invalid_domains <= 3:  # Show first 3 invalid domains
                            print(f"  Invalid domain at row {index+1}: {link_str} -> {domain}")
                except Exception as e:
                    error_links += 1
                    if error_links <= 3:  # Show first 3 errors
                        print(f"  Error extracting domain from row {index+1} ({link}): {e}")
                    continue
            else:
                empty_links += 1
        
        # Find and print duplicate domains
        duplicates = {domain: rows for domain, rows in duplicate_tracker.items() if len(rows) > 1}
        
        # Remove duplicates while preserving order
        original_count = len(domains)
        unique_domains = list(dict.fromkeys(domains))
        duplicates_removed = original_count - len(unique_domains)
        
        print(f"\nDomain extraction summary:")
        print(f"  Total entries in Excel: {len(df)}")
        print(f"  Empty/NaN links: {empty_links}")
        print(f"  Invalid domains: {invalid_domains}")
        print(f"  Errors during extraction: {error_links}")
        print(f"  Valid domains found: {original_count}")
        print(f"  Duplicate domains removed: {duplicates_removed}")
        print(f"  Final unique domains: {len(unique_domains)}")
        
        # Print duplicate domains
        if duplicates:
            print(f"\n📋 DUPLICATE DOMAINS FOUND ({len(duplicates)} duplicates):")
            for domain, rows in duplicates.items():
                print(f"  🔄 {domain} (appears in rows: {', '.join(map(str, rows))})")
        else:
            print(f"\n✅ No duplicate domains found")
        
        return unique_domains
        
    except FileNotFoundError:
        print(f"Error: Excel file '{excel_file}' not found")
        return []
    except Exception as e:
        print(f"Error reading Excel file: {e}")
        return []

def generate_queries_for_domain(domain: str) -> list:
    """Generates a list of query dictionaries for the given domain."""
    # This function now holds the query templates.
    query_templates = [
        {'title': 'About Us', 'query': f'get me the information about (About Us information) for {domain} university, We do not need the Mission and Vision, Educational Philosophy, or Campuses and Infrastructure sections. We only need the About Us section, if found about us section copy it exactly only search in this domain'},
        {'title': 'Vision', 'query': f'for {domain} university, search only in domain, summary the basic vision of the university in general. not for specific departments but for university in general'},
        {'title': 'Mission', 'query': f'for {domain} university, search only in domain, summary the basic Mission of the university in general. not for specific departments but for university in general'},
        {'title': 'Locations and Addresses', 'query': f'get me all information about (all location, return all location titles with its address) for {domain} university, only search in this domain'},
        {'title': 'Degrees', 'query': f'get me all information about (Degrees) avalible for {domain} university like (Bachelor, Master, PhD, etc.), only search in this domain, If not found return "no information found"'},
        {'title': 'Phone Numbers', 'query': f'get me all phone numbers for {domain} university, only search in this domain'},
        {'title': 'Partnerships', 'query': f'get me all information about Partnerships for {domain} university, only search in this domain'},
        {'title': 'Awards and Honors', 'query': f'what is the Awards and Honors for {domain} university, only search in this domain'},
        {'title': 'Study Languages', 'query': f'what is the study languages in its various programs, search only in domain {domain}, for example "English, German, French, etc.'},
        {'title': 'Student Statistics', 'query': f'how many (Total Students, Undergraduate Students, Graduate Students, Local Students, International Students, Total Alumni) in this university {domain}, search for evey number of these if found search only in domain'},
        {'title': 'Accreditation', 'query': f'what is the Accreditation for {domain} university, If not found return "no information found", search only in domain'},
        {'title': 'Blocked Nationalities', 'query': f'what is the Blocked Nationalities for {domain} university, If not found return "no information found", search only in domain'},
        {'title': 'University Motto', 'query': f'what is the University Motto for {domain} university, If not found return "no information found", search only in domain'},
        {'title': 'THE Ranking', 'query': f'What is the significance of {domain} university in Times Higher Education Impact Rankings?, If not found return "no information found", search only in domain'},
        {'title': 'Year Established', 'query': f'what is the Year Established for {domain} university, If not found return "no information found", search only in domain'},
        {'title': 'Admission Requirements', 'query': f'what is the Admission Requirements for for differents Degrees in {domain} university, search only in domain'},
        {'title': 'Scholarships', 'query': f'what is the Scholarships and Financial Aids, list all scholarships and financial aids provided by {domain}, search only in domain'},
        {'title': 'Student Discounts', 'query': f'what are the Discounts provided by {domain} for students,i dont want discounts was provided in the past i need any information abount disounts provided to for exmaple: Sibling Discounts, or international stundens or any kind of discounts, search only in domain'},
        {'title': 'Notable Alumni', 'query': f'Who are the Notable Alumni of {domain}, list all, If not found return "no information found", search only in domain'},
        {'title': 'Partner Institutions', 'query': f'Who are the Partner Institutions of {domain}, list all, If not found return "no information found", search only in domain'},
        {'title': 'Industry Collaborations', 'query': f'list all information about Industry Collaborations of {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Student Satisfaction', 'query': f'what is the Student Satisfaction Ratings and Graduate Employment Rates for {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Early Admission Policy', 'query': f'what is the Early Admission Policy for {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Deferred Admission Policy', 'query': f'what is the Deferred Admission Policy for {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Transfer Student Policy', 'query': f'what is the Transfer Student Policy for {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Application Process', 'query': f'what is the Application Process Steps for {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Contact Information', 'query': f'what is the contact information (contacts names, Emails, organization phone numbers) for {domain}, search only in domain'},
        {'title': 'Social Media', 'query': f'what are the social media accounts with links for {domain} (Facebook, X, Youtube, Instagram, Linkedin, Tiktok, Vk), search only in domain'},
        {'title': 'Library Information', 'query': f'List all information about Library Information at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Research Facilities', 'query': f'List all information about Research Facilities at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Housing Options', 'query': f'List all information about Housing Options at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Dining Services', 'query': f'List all information about Dining Services at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Sports Facilities', 'query': f'List all information about Sports and ATHLETIC Facilities like soccer , basketball or any other sports at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Student Organizations', 'query': f'List all information about Student Organizations at {domain}, If not found return "no information found",return student organizations and communities are available to cater to diverse academic, social, and cultural interests? ,search only in domain'},
        {'title': 'Health Services', 'query': f'List all information about Health Services for students and staff at {domain} university, only search in this domain , search or health facilities not programs'},
        {'title': 'Career Counseling', 'query': f'List all information about Career Counseling Services at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Disability Services', 'query': f'List all information about Disability Services at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Study Abroad Programs', 'query': f'List all information about Study Abroad and Exchange Students Details at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Campus Security', 'query': f'List all information about Campus Security Services at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Technology Resources', 'query': f'List all information about Technology Resources at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Transportation Services', 'query': f'List all information about Transportation Services at {domain},dont return information about programs i need services provided to students like shuttle bus etc..., If not found return "no information found", search only in domain'},
        {'title': 'Cultural Centers', 'query': f'List all information about Cultural Centers at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Recreation Facilities', 'query': f'List all information about Recreation Facilities at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Childcare Services', 'query': f'List all information about Childcare Services :(Services provided to children such as students children, do not return any information about programs) at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Financial Services', 'query': f'List all information about Financial Services (services for students to manage finances) at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Religious Services', 'query': f'List all information about Religious/Spiritual Services at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Student Support Services', 'query': f'List all information about Student Support Services at {domain}, If not found return "no information found", search only in domain'}
    ]
    
    return [{'title': qt['title'], 'query': qt['query']} for qt in query_templates]

def clean_citation_references(text: str) -> str:
    """
    Remove citation references like [1], [2], [3], [^1^], [^ranking^], etc. from the text.
    Remove <think> block .
    """
    # Pattern to match citation references: [number] or [number][number] etc.
    cleaned_text = re.sub(r'\[\d+\](?:\[\d+\])*', '', text)
    # Pattern to match citation references: [^number^] or [^text^]
    cleaned_text = re.sub(r'\[\^[^\]]+\^\]', '', cleaned_text)
    
    cleaned_text = re.sub(r'<think>.*?</think>', '', cleaned_text, flags=re.DOTALL).strip()
    
    return cleaned_text

def check_no_information_found(response_content: str) -> str:
    """
    Check if the response contains 'no information found' phrase.
    If found, return empty string. Otherwise, return the original response.
    """
    if response_content and "no information found" in response_content.lower():
        return ""
    return response_content

def format_citations(citations: list) -> str:
    """Format citations list into a single string."""
    if not citations:
        return "No citations provided"
    
    formatted_citations = []
    for i, citation in enumerate(citations, 1):
        formatted_citations.append(f"{i}. {citation}")
    
    return "\n".join(formatted_citations)

def calc_sonar_pro_cost(meta: dict) -> float:
    """
    Return the USD cost for one Sonar Pro API call.

    Parameters
    ----------
    meta : dict
        A usage-metadata dictionary like the one Perplexity returns, e.g.
        {
            'completion_tokens': 694,
            'prompt_tokens': 37,
            'total_tokens': 731,
            'search_context_size': 'low'
        }

    Returns
    -------
    float
        Cost in US dollars (rounded to 6 decimals).
    """
    
    PRICING = {
    "low":    {"input_per_million": 3.0, "output_per_million": 15.0,
               "per_request": 6 / 1000},
    "medium": {"input_per_million": 3.0, "output_per_million": 15.0,
               "per_request": 10 / 1000},
    "high":   {"input_per_million": 3.0, "output_per_million": 15.0,
               "per_request": 14 / 1000},
    }
    
    ctx = meta.get("search_context_size", "low")
    p = PRICING[ctx]

    # convert per-million rates to per-token
    input_rate  = p["input_per_million"]  / 1_000_000
    output_rate = p["output_per_million"] / 1_000_000

    prompt_tokens     = meta.get("prompt_tokens", 0)
    completion_tokens = meta.get("completion_tokens", 0)

    token_cost   = prompt_tokens * input_rate + completion_tokens * output_rate
    request_cost = p["per_request"]

    return round(token_cost + request_cost, 6)

def perplexity_call(perplexity_query: str, domain: str, query_title: str, perplexity_model: str = "sonar-reasoning-pro", temperature: float = 0.01) -> tuple:
    """Calls the Perplexity API with a specific query and domain filter."""
    if not PERPLEXITY_API_KEY:
        thread_safe_print("API Error: PERPLEXITY_API_KEY environment variable not set.")
        return "", {}, []
    
    # Validate domain format before making API call
    if not domain or '.' not in domain or domain.startswith('.') or domain.endswith('.'):
        thread_safe_print(f"API Error: Invalid domain format: {domain}")
        return "Failed to retrieve information - invalid domain format.", {}, []
    
    url = "https://api.perplexity.ai/chat/completions"
    headers = {"Authorization": f"Bearer {PERPLEXITY_API_KEY}", "Content-Type": "application/json"}
    
    # Base payload
    payload = {
        "model": perplexity_model,
        "messages": [
            {"role": "system",
            "content": (
            "You are a specialized web crawler and information extractor. "
            "Focus exclusively on the provided domain. "
            "Answer the user's query concisely and accurately based on the website's content. "
            "Return the answer directly without any additional text before the answer or after the answer. "
            "If you don't have the ability to actively search or crawl websites, return 'no information found'. "
            "If information is not found, clearly return 'no information found'. "
            "Always return the information in English."
        )},
            {"role": "user",
            "content": perplexity_query}
        ],
        "temperature": temperature,
        "search_domain_filter": [domain],
        "return_citations": True
    }
    
    # Add web_search_options for specific queries
    if query_title in HIGH_CONTEXT_QUERIES:
        payload["web_search_options"] = {
            "search_context_size": "high"
        }
        thread_safe_print(f"  → Using HIGH context size for '{query_title}'")
    else:
        thread_safe_print(f"  → Using default context size for '{query_title}'")
    
    try:
        response = requests.post(url, headers=headers, json=payload, timeout=120)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        thread_safe_print(f"API Request Error for {domain}: {e}")
        return f"Failed to retrieve information due to API error: {e}", {}, []
    
    response_json = response.json()
    content = response_json['choices'][0]['message']['content']
    usage = response_json.get('usage', {})
    citations = response_json.get('citations', [])
    
    return content, usage, citations

def process_single_domain(domain: str, domain_index: int, total_domains: int) -> dict:
    """Process a single domain and return results."""
    thread_id = threading.current_thread().name
    thread_safe_print(f"\n[Thread {thread_id}] STARTING DOMAIN [{domain_index}/{total_domains}]: {domain}")
    
    try:
        # Generate dynamic filename for the current domain in subdirectory
        output_filename = os.path.join(OUTPUT_FOLDER, f"{domain}_data.xlsx")
        queries = generate_queries_for_domain(domain)
        
        # Initialize cost tracker and data storage for the current domain
        domain_total_cost = 0.0
        data_rows = []
        
        total_queries = len(queries)
        for i, item in enumerate(queries):
            title = item['title']
            query = item['query']
            
            thread_safe_print(f"[Thread {thread_id}] [{i+1}/{total_queries}] Processing '{title}' for {domain}...")
            
            response_content, usage, citations = perplexity_call(
                perplexity_query=query,
                domain=domain,
                query_title=title
            )
            
            if usage:
                domain_total_cost += calc_sonar_pro_cost(usage)
            
            if response_content:
                response_content = clean_citation_references(response_content)
                # Check for "no information found" and return empty string if found
                response_content = check_no_information_found(response_content)
            else:
                response_content = "Failed to retrieve information for this query."
            
            # Format citations for Excel
            formatted_citations = format_citations(citations)
            
            # Add row to data
            data_rows.append({
                'Title': title,
                'Response Content': response_content,
                'Citations': formatted_citations
            })
            
            thread_safe_print(f"  [Thread {thread_id}] Finished: '{title}'. Data collected.")
            time.sleep(1)  # Be polite to the API

        # Create DataFrame and save to Excel
        df = pd.DataFrame(data_rows)
        
        # Save to Excel with formatting (using original naming convention)
        with pd.ExcelWriter(output_filename, engine='openpyxl') as writer:
            df.to_excel(writer, sheet_name=f'{domain}_data', index=False)
            
            # Get the workbook and worksheet for formatting
            workbook = writer.book
            worksheet = writer.sheets[f'{domain}_data']
            
            # Auto-adjust column widths
            for column in worksheet.columns:
                max_length = 0
                column_letter = column[0].column_letter
                
                for cell in column:
                    try:
                        if len(str(cell.value)) > max_length:
                            max_length = len(str(cell.value))
                    except:
                        pass
                
                # Set a reasonable maximum width
                adjusted_width = min(max_length + 2, 50)
                worksheet.column_dimensions[column_letter].width = adjusted_width
            
            # Enable text wrapping for all cells
            from openpyxl.styles import Alignment
            for row in worksheet.iter_rows():
                for cell in row:
                    cell.alignment = Alignment(wrap_text=True, vertical='top')

        # Return results for this domain
        result = {
            'domain': domain,
            'domain_index': domain_index,
            'output_filename': output_filename,
            'total_rows': len(data_rows),
            'domain_cost': domain_total_cost,
            'thread_id': thread_id
        }
        
        thread_safe_print(f"[Thread {thread_id}] COMPLETED DOMAIN: {domain} | Rows: {len(data_rows)} | Cost: ${domain_total_cost:.4f}")
        return result
        
    except Exception as e:
        thread_safe_print(f"[Thread {thread_id}] ERROR processing {domain}: {e}")
        return {
            'domain': domain,
            'domain_index': domain_index,
            'error': str(e),
            'thread_id': thread_id
        }

def process_domains_in_parallel(domains_batch: list, batch_start_index: int) -> list:
    """Process a batch of domains in parallel using ThreadPoolExecutor."""
    thread_safe_print(f"\n{'='*80}")
    thread_safe_print(f"STARTING PARALLEL BATCH: {len(domains_batch)} domains")
    thread_safe_print(f"Domains: {', '.join(domains_batch[:5])}{'...' if len(domains_batch) > 5 else ''}")
    thread_safe_print(f"{'='*80}")
    
    results = []
    
    # Use ThreadPoolExecutor for parallel processing
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # Submit all tasks
        future_to_domain = {}
        for i, domain in enumerate(domains_batch):
            domain_index = batch_start_index + i + 1
            future = executor.submit(process_single_domain, domain, domain_index, len(domains_batch))
            future_to_domain[future] = domain
        
        # Collect results as they complete
        for future in as_completed(future_to_domain):
            domain = future_to_domain[future]
            try:
                result = future.result()
                results.append(result)
            except Exception as e:
                thread_safe_print(f"ERROR: Exception occurred for domain {domain}: {e}")
                results.append({
                    'domain': domain,
                    'error': str(e)
                })
    
    return results

def main():
    """Main function to orchestrate the processing of all domains."""
    if not PERPLEXITY_API_KEY:
        print("FATAL: PERPLEXITY_API_KEY is not set. Exiting.")
        return

    # Extract domains from Excel file
    target_domains = extract_domains_from_excel(EXCEL_FILE)
    
    if not target_domains:
        print("No domains found in Excel file. Exiting.")
        return

    # Create output folder if it doesn't exist
    Path(OUTPUT_FOLDER).mkdir(exist_ok=True)
    print(f"Output folder '{OUTPUT_FOLDER}' created/verified")

    print(f"\nHIGH CONTEXT QUERIES: {', '.join(HIGH_CONTEXT_QUERIES)}")
    print(f"Total queries with high context: {len(HIGH_CONTEXT_QUERIES)}")
    print(f"\nStarting parallel processing with {MAX_WORKERS} threads")
    print(f"Processing {len(target_domains)} domains in batches of {BATCH_SIZE}")

    # Initialize overall cost tracker
    total_overall_cost = 0.0
    total_rows_collected = 0
    successful_domains = 0
    failed_domains = 0
    
    # Process domains in batches
    for batch_start in range(0, len(target_domains), BATCH_SIZE):
        batch_end = min(batch_start + BATCH_SIZE, len(target_domains))
        domains_batch = target_domains[batch_start:batch_end]
        
        batch_number = (batch_start // BATCH_SIZE) + 1
        total_batches = (len(target_domains) + BATCH_SIZE - 1) // BATCH_SIZE
        
        print(f"\n{'*'*80}")
        print(f"PROCESSING BATCH {batch_number}/{total_batches}")
        print(f"Domains {batch_start + 1} to {batch_end} of {len(target_domains)}")
        print(f"{'*'*80}")
        
        # Process this batch in parallel
        batch_results = process_domains_in_parallel(domains_batch, batch_start)
        
        # Aggregate results from this batch
        batch_cost = 0.0
        batch_rows = 0
        
        for result in batch_results:
            if 'error' not in result:
                batch_cost += result.get('domain_cost', 0)
                batch_rows += result.get('total_rows', 0)
                successful_domains += 1
                print(f"✓ {result['domain']}: {result['total_rows']} rows, ${result['domain_cost']:.4f}")
            else:
                failed_domains += 1
                print(f"✗ {result['domain']}: ERROR - {result.get('error', 'Unknown error')}")
        
        total_overall_cost += batch_cost
        total_rows_collected += batch_rows
        
        print(f"\nBatch {batch_number} Summary:")
        print(f"  Batch Cost: ${batch_cost:.4f}")
        print(f"  Batch Rows: {batch_rows}")
        print(f"  Running Total Cost: ${total_overall_cost:.4f}")
        print(f"  Running Total Rows: {total_rows_collected}")

    print("\n" + "="*80)
    print("ALL DOMAINS PROCESSING COMPLETED!")
    print("="*80)
    print(f"Total domains processed: {len(target_domains)}")
    print(f"Successful domains: {successful_domains}")
    print(f"Failed domains: {failed_domains}")
    print(f"Total rows collected: {total_rows_collected}")
    print(f"Total estimated cost: ${total_overall_cost:.4f}")
    print(f"All Excel files saved in: {OUTPUT_FOLDER}")
    print("="*80)

# Run the main function
if __name__ == "__main__":
    main()


Processing domains from Excel file...
  Row 1: https://international.sisli.edu.tr/ -> sisli.edu.tr
  Row 2: https://www.newhaven.edu/index.php -> newhaven.edu
  Row 3: https://www.khas.edu.tr/en/ -> khas.edu.tr
  Row 4: https://www.istinye.edu.tr/en -> istinye.edu.tr
  Row 5: https://www.beykent.edu.tr/ -> beykent.edu.tr

Domain extraction summary:
  Total entries in Excel: 249
  Empty/NaN links: 0
  Invalid domains: 0
  Errors during extraction: 0
  Valid domains found: 249
  Duplicate domains removed: 24
  Final unique domains: 225

📋 DUPLICATE DOMAINS FOUND (22 duplicates):
  🔄 newhaven.edu (appears in rows: 2, 115)
  🔄 beykent.edu.tr (appears in rows: 5, 157)
  🔄 acibadem.edu.tr (appears in rows: 14, 183)
  🔄 ecpi.edu (appears in rows: 15, 111)
  🔄 yeniyuzyil.edu.tr (appears in rows: 18, 142, 154)
  🔄 bauinternational.edu.ge (appears in rows: 19, 58)
  🔄 okan.edu.tr (appears in rows: 30, 193)
  🔄 relay.edu (appears in rows: 35, 124)
  🔄 bryantstratton.edu (appears in rows: 40, 114

# New-Perplexity Search API

In [None]:
! pip install perplexityai

In [None]:
!setx PERPLEXITY_API_KEY "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"


SUCCESS: Specified value was saved.


In [94]:

load_dotenv()

client = Perplexity()

# Set your target domain(s)
domain = ["univ-amu.fr"]

# Ask a general university-level question
query = f'get me all phone numbers for {domain} university, only search in this domain'

response = client.chat.completions.create(
    model="sonar",  # or "sonar-pro" for better quality
    messages=[
        {
            "role": "system", 
            "content": (
            "You are a specialized web crawler and information extractor . "
            f"Focus exclusively on the provided {domain}. "
            "Answer the user's query concisely and accurately based on the website's content. "
            "Return the answer directly without any additional text before the answer or after the answer. "
            "If you don't get the answer from domain search in general about it "
            "If information is not found, clearly return 'no information found'. "
            "Always return the information in English."
        )
        },
        {
            "role": "user",
            "content": query
        }
    ],
    #search_domain_filter=domain,
    max_tokens=4000,
)
print("full response:")
print(response)
# ---- Format the output ----
print("\n=== response ===\n")
print(response.choices[0].message.content.strip())

print("\n=== Citations ===")
for url in response.citations:
    print(f"- {url}")

print("\n=== Search Results ===")
for i, r in enumerate(response.search_results, 1):
    snippet = r.snippet.replace("\n\n", " ").strip()
    print(f"{i}. {r.title}")
    print(f"   URL: {r.url}")
    if r.date:
        print(f"   Date: {r.date}")
    if r.last_updated:
        print(f"   Last Updated: {r.last_updated}")
    print(f"   Snippet: {snippet}\n")

# ---- Show cost details ----
print("\n=== Cost Info ===")
print(f"Total Cost: ${response.usage.cost.total_cost:.4f}")
print(f"  • Input tokens cost: ${response.usage.cost.input_tokens_cost}")
print(f"  • Output tokens cost: ${response.usage.cost.output_tokens_cost}")
print(f"  • Request cost: ${response.usage.cost.request_cost}")
if response.usage.cost.citation_tokens_cost is not None:
    print(f"  • Citation tokens cost: ${response.usage.cost.citation_tokens_cost}")
if response.usage.cost.search_queries_cost is not None:
    print(f"  • Search queries cost: ${response.usage.cost.search_queries_cost}")



full response:
CompletionCreateResponse(id='8a8aeb60-b09a-4f7f-8b64-c0966cb9be3f', choices=[Choice(delta=ChatMessageOutput(content='', role='assistant', reasoning_steps=None, tool_calls=None), index=0, message=ChatMessageOutput(content='The phone numbers for Aix-Marseille University (univ-amu.fr) are as follows:\n\n- General university phone number: +33 4 91 39 65 00  \n- Alternative university phone number: +33 4 91 82 90 00  \n- Institut des Sciences Moléculaires de Marseille contact (Thierry Constantieux): +33 4 13 94 56 62  \n\nThese numbers come directly from the official Aix-Marseille University domain or affiliated pages within the univ-amu.fr domain[1][2][3].', role='assistant', reasoning_steps=None, tool_calls=None), finish_reason='stop')], created=1761136809, model='sonar', usage=UsageInfo(completion_tokens=120, cost=Cost(input_tokens_cost=0.0, output_tokens_cost=0.0, total_cost=0.005, citation_tokens_cost=None, reasoning_tokens_cost=None, request_cost=0.005, search_queries_c

## save in excell 

In [50]:
import os
import re
import time
import json
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock
import threading
import logging
from dotenv import load_dotenv
from perplexity import Perplexity
import pandas as pd

# Load environment variables from .env file
load_dotenv()

# --- Configuration ---
PERPLEXITY_API_KEY = os.getenv("PERPLEXITY_API_KEY")

# Debug: Check if API key is loaded
if PERPLEXITY_API_KEY:
    print(f"✓ PERPLEXITY_API_KEY loaded successfully (starts with: {PERPLEXITY_API_KEY[:8]}...)")
else:
    print("✗ PERPLEXITY_API_KEY not found in environment variables")

# Target domains to process
TARGET_DOMAINS = ["ox.ac.uk","harvard.edu"]

# Output folder (can be overridden by environment variables)
OUTPUT_FOLDER = os.getenv("OUTPUT_FOLDER", "universities_informations")

# Parallel processing configuration
MAX_WORKERS = 30
BATCH_SIZE = 30

# Queries that should use sonar-pro model
SONAR_PRO_QUERIES = { 
    'Faculties and Departments',
    'Scholarships',
    'Accreditation',
    'Locations and Addresses',
    'Admission Requirements',
    'Contact Information',
    'Application Process',
    'Study Languages',
    'Transfer Student Policy'
}

# Thread-safe lock
cost_lock = Lock()
logger_lock = Lock()

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(threadName)s] %(levelname)s: %(message)s',
    handlers=[
        logging.FileHandler('perplexity_search_excel.log', encoding='utf-8'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

def thread_safe_log(level, message, *args):
    """Thread-safe logging function."""
    with logger_lock:
        if level == 'info':
            logger.info(message, *args)
        elif level == 'error':
            logger.error(message, *args)
        elif level == 'warning':
            logger.warning(message, *args)

def get_target_domains() -> list:
    """Return the list of target domains to process."""
    logger.info(f"\nUsing predefined target domains:")
    for i, domain in enumerate(TARGET_DOMAINS):
        logger.info(f"  {i+1}. {domain}")
    
    logger.info(f"\nTotal domains to process: {len(TARGET_DOMAINS)}")
    return TARGET_DOMAINS

def generate_queries_for_domain(domain: str) -> list:
    """Generates a list of query dictionaries for the given domain."""
    query_templates = [
        {'title': 'About Us', 'query': 'what is the (About Us) information for this university , dont retruen any other information about mession or vesion or any faculty'},
        {'title': 'Vision', 'query': 'for this university, search only in domain, summary the basic vision of the university in general. not for specific departments but for university in general'},
        {'title': 'Mission', 'query': 'for this university, search only in domain, summary the basic Mission of the university in general. not for specific departments but for university in general'},
        {'title': 'Locations and Addresses', 'query': 'get me all information about (all location, return all location titles with its address) for this university, only search in this domain'},
        {'title': 'Degrees', 'query': 'get me all information about (Degrees) avalible for this university like (Bachelor, Master, PhD, etc.), only search in this domain, If not found return "no information found"'},
        {'title': 'Phone Numbers', 'query': 'get me all phone numbers for this university, only search in this domain'},
        {'title': 'Partnerships', 'query': 'get me all information about Partnerships for this university, only search in this domain'},
        {'title': 'Awards and Honors', 'query': 'what is the Awards and Honors for this university, only search in this domain'},
        {'title': 'Study Languages', 'query': 'what is the study languages in its various programs, search only in this domain, for example "English, German, French, etc.'},
        {'title': 'Student Statistics', 'query': 'how many (Total Students, Undergraduate Students, Graduate Students, Local Students, International Students, Total Alumni) in this university, search for evey number of these if found search only in domain'},
        {'title': 'Accreditation', 'query': 'what is the Accreditation for this university, If not found return "no information found", search only in domain'},
        {'title': 'Blocked Nationalities', 'query': 'what is the Blocked Nationalities for this university, If not found return "no information found", search only in domain'},
        {'title': 'University Motto', 'query': 'what is the University Motto for this university, If not found return "no information found", search only in domain'},
        {'title': 'THE Ranking', 'query': 'What is the significance of this university in Times Higher Education Impact Rankings?, If not found return "no information found", search only in domain'},
        {'title': 'Year Established', 'query': 'what is the Year Established for this university, If not found return "no information found", search only in domain'},
        {'title': 'Admission Requirements', 'query': 'what is the Admission Requirements for for differents Degrees in this university, search only in domain'},
        {'title': 'Scholarships', 'query': 'what is the Scholarships and Financial Aids, list all scholarships and financial aids provided by this university, search only in domain'},
        {'title': 'Student Discounts', 'query': 'what are the Discounts provided by this university for students,i dont want discounts was provided in the past i need any information abount disounts provided to for exmaple: Sibling Discounts, or international stundens or any kind of discounts, search only in domain'},
        {'title': 'Notable Alumni', 'query': 'Who are the Notable Alumni of this university, list all, If not found return "no information found", search only in domain'},
        {'title': 'Partner Institutions', 'query': 'Who are the Partner Institutions of this university, list all, If not found return "no information found", search only in domain'},
        {'title': 'Industry Collaborations', 'query': 'list all information about Industry Collaborations of this university, If not found return "no information found", search only in domain'},
        {'title': 'Student Satisfaction', 'query': 'what is the Student Satisfaction Ratings and Graduate Employment Rates for this university, If not found return "no information found", search only in domain'},
        {'title': 'Early Admission Policy', 'query': 'what is the Early Admission Policy for this university, If not found return "no information found", search only in domain'},
        {'title': 'Deferred Admission Policy', 'query': 'what is the Deferred Admission Policy for this university, If not found return "no information found", search only in domain'},
        {'title': 'Transfer Student Policy', 'query': 'what is the Transfer Student Policy for this university, If not found return "no information found", search only in domain'},
        {'title': 'Application Process', 'query': 'what is the Application Process Steps for this university, If not found return "no information found", search only in domain'},
        {'title': 'Contact Information', 'query': 'what is the contact information (contacts names, Emails, organization phone numbers) for this university, search only in domain'},
        {'title': 'Social Media', 'query': 'what are the social media accounts with links for this university (Facebook, X, Youtube, Instagram, Linkedin, Tiktok, Vk), search only in domain'},
        {'title': 'Library Information', 'query': 'List all information about Library Information at this university, If not found return "no information found", search only in domain'},
        {'title': 'Research Facilities', 'query': 'List all information about Research Facilities at this university, If not found return "no information found", search only in domain'},
        {'title': 'Housing Options', 'query': 'List all information about Housing Options at this university, If not found return "no information found", search only in domain'},
        {'title': 'Dining Services', 'query': 'List all information about Dining Services at this university, If not found return "no information found", search only in domain'},
        {'title': 'Sports Facilities', 'query': 'List all information about Sports and ATHLETIC Facilities like soccer , basketball or any other sports at this university, If not found return "no information found", search only in domain'},
        {'title': 'Student Organizations', 'query': 'List all information about Student Organizations at this university, If not found return "no information found",return student organizations and communities are available to cater to diverse academic, social, and cultural interests? ,search only in domain'},
        {'title': 'Health Services', 'query': 'List all information about Health Services for students and staff at this university, only search in this domain , search or health facilities not programs'},
        {'title': 'Career Counseling', 'query': 'List all information about Career Counseling Services at this university, If not found return "no information found", search only in domain'},
        {'title': 'Disability Services', 'query': 'List all information about Disability Services at this university, If not found return "no information found", search only in domain'},
        {'title': 'Study Abroad Programs', 'query': 'List all information about Study Abroad and Exchange Students Details at this university, If not found return "no information found", search only in domain'},
        {'title': 'Campus Security', 'query': 'List all information about Campus Security Services at this university, If not found return "no information found", search only in domain'},
        {'title': 'Technology Resources', 'query': 'List all information about Technology Resources at this university, If not found return "no information found", search only in domain'},
        {'title': 'Transportation Services', 'query': 'List all information about Transportation Services at this university,dont return information about programs i need services provided to students like shuttle bus etc..., If not found return "no information found", search only in domain'},
        {'title': 'Cultural Centers', 'query': 'List all information about Cultural Centers at this university, If not found return "no information found", search only in domain'},
        {'title': 'Recreation Facilities', 'query': 'List all information about Recreation Facilities at this university, If not found return "no information found", search only in domain'},
        {'title': 'Childcare Services', 'query': 'List all information about Childcare Services :(Services provided to children such as students children, do not return any information about programs) at this university, If not found return "no information found", search only in domain'},
        {'title': 'Financial Services', 'query': 'List all information about Financial Services (services for students to manage finances) at this university, If not found return "no information found", search only in domain'},
        {'title': 'Religious Services', 'query': 'List all information about Religious/Spiritual Services at this university, If not found return "no information found", search only in domain'},
        {'title': 'Student Support Services', 'query': 'List all information about Student Support Services at this university, If not found return "no information found", search only in domain'}
    ]
    
    return query_templates


def perplexity_chat_completions_client(query: str, domain: str, model: str = "sonar") -> dict:
    """Calls the new Perplexity API chat completions with domain filtering."""
    if not PERPLEXITY_API_KEY:
        thread_safe_log('error', "API Error: PERPLEXITY_API_KEY environment variable not set.")
        return {}
    
    try:
        client = Perplexity()
        
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system",
                 "content": (
                "You are a specialized web crawler and information extractor. "
                "Focus exclusively on the provided domain. "
                "Answer the user's query concisely and accurately based on the website's content. "
                "Return the answer directly without any additional text before the answer or after the answer. "
                "If you don't have the ability to actively search or crawl websites, return 'no information found'. "
                "If information is not found, clearly return 'no information found'. "
                "Always return the information in English."
                )
                },
                {"role": "user",
                 "content": query}
            ],
            search_domain_filter=[domain],
            max_tokens=2000
        )
        
        return {
            'content': response.choices[0].message.content.strip() if response.choices else "",
            'citations': response.citations if hasattr(response, 'citations') else [],
            'search_results': response.search_results if hasattr(response, 'search_results') else [],
            'usage': response.usage if hasattr(response, 'usage') else None
        }
        
    except Exception as e:
        thread_safe_log('error', f"API Error for {domain}: {e}")
        return {}

def format_citations(citations: list) -> str:
    """Format citations into a single line string."""
    if not citations or len(citations) == 0:
        return "No citations"
    
    # Extract URLs from citations and join them with newlines (for Excel multi-line cell)
    citation_urls = []
    for citation in citations:
        if isinstance(citation, str):
            citation_urls.append(citation)
        elif hasattr(citation, 'url'):
            citation_urls.append(citation.url)
        elif isinstance(citation, dict) and 'url' in citation:
            citation_urls.append(citation['url'])
    
    if not citation_urls:
        return "No citations"
    
    # Join with newline for multi-line display in Excel cell
    return "\n".join(citation_urls)

def process_single_domain(domain: str, domain_index: int, total_domains: int) -> dict:
    """Process a single domain and return results."""
    thread_id = threading.current_thread().name
    thread_safe_log('info', f"\n[Thread {thread_id}] STARTING DOMAIN [{domain_index}/{total_domains}]: {domain}")
    
    try:
        output_filename = os.path.join(OUTPUT_FOLDER, f"{domain}_data.xlsx")
        queries = generate_queries_for_domain(domain)
        
        all_domain_urls = set()
        total_cost = 0.0
        
        # Prepare data for Excel
        excel_data = []
        
        total_queries = len(queries)
        for i, item in enumerate(queries):
            title = item['title']
            query = item['query']
            
            # Determine which model to use
            model = "sonar-pro" if title in SONAR_PRO_QUERIES else "sonar"
            
            thread_safe_log('info', f"[Thread {thread_id}] [{i+1}/{total_queries}] Processing '{title}' for {domain} (model: {model})...")
            
            response_data = perplexity_chat_completions_client(query=query, domain=domain, model=model)
            
            if response_data:
                if response_data.get('usage') and hasattr(response_data['usage'], 'cost'):
                    cost = response_data['usage'].cost
                    if hasattr(cost, 'total_cost'):
                        total_cost += float(cost.total_cost)
                
                # Get content and format citations
                content = response_data.get('content', 'no information found')
                citations = format_citations(response_data.get('citations', []))
                
                # Add to Excel data
                excel_data.append({
                    'Title': title,
                    'Response Content': content,
                    'Citations': citations
                })
                
                # Still track URLs for statistics
                if response_data.get('search_results'):
                    for result in response_data['search_results']:
                        if hasattr(result, 'url') and result.url:
                            normalized_url = result.url.rstrip('/').lower()
                            if normalized_url not in all_domain_urls:
                                all_domain_urls.add(normalized_url)
                
                thread_safe_log('info', f"  [Thread {thread_id}] Processed '{title}'")
            else:
                excel_data.append({
                    'Title': title,
                    'Response Content': 'no information found',
                    'Citations': 'No citations'
                })
                thread_safe_log('info', f"  [Thread {thread_id}] No response data for '{title}'")
            
            time.sleep(0.5)
        
        # Create DataFrame and save to Excel
        df = pd.DataFrame(excel_data)
        
        # Save to Excel with formatting
        with pd.ExcelWriter(output_filename, engine='openpyxl') as writer:
            df.to_excel(writer, index=False, sheet_name='University Data')
            
            # Get the worksheet to apply formatting
            worksheet = writer.sheets['University Data']
            
            # Adjust column widths
            worksheet.column_dimensions['A'].width = 30  # Title
            worksheet.column_dimensions['B'].width = 80  # Response Content
            worksheet.column_dimensions['C'].width = 60  # Citations
            
            # Enable text wrapping for all cells
            from openpyxl.styles import Alignment
            for row in worksheet.iter_rows(min_row=2, max_row=len(excel_data)+1):
                for cell in row:
                    cell.alignment = Alignment(wrap_text=True, vertical='top')
            
            # Bold header row
            from openpyxl.styles import Font
            for cell in worksheet[1]:
                cell.font = Font(bold=True)
                cell.alignment = Alignment(horizontal='center', vertical='center', wrap_text=True)

        result = {
            'domain': domain,
            'domain_index': domain_index,
            'output_filename': output_filename,
            'total_queries': len(excel_data),
            'total_cost': total_cost,
            'thread_id': thread_id
        }
        
        thread_safe_log('info', f"[Thread {thread_id}] COMPLETED DOMAIN: {domain} | Queries: {len(excel_data)} | Cost: ${total_cost:.4f}")
        return result
        
    except Exception as e:
        thread_safe_log('error', f"[Thread {thread_id}] ERROR processing {domain}: {e}")
        return {
            'domain': domain,
            'domain_index': domain_index,
            'error': str(e),
            'thread_id': thread_id
        }

def process_domains_in_parallel(domains_batch: list, batch_start_index: int) -> list:
    """Process a batch of domains in parallel using ThreadPoolExecutor."""
    thread_safe_log('info', f"\n{'='*80}")
    thread_safe_log('info', f"STARTING PARALLEL BATCH: {len(domains_batch)} domains")
    thread_safe_log('info', f"Domains: {', '.join(domains_batch[:5])}{'...' if len(domains_batch) > 5 else ''}")
    thread_safe_log('info', f"{'='*80}")
    
    results = []
    
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        future_to_domain = {}
        for i, domain in enumerate(domains_batch):
            domain_index = batch_start_index + i + 1
            future = executor.submit(process_single_domain, domain, domain_index, len(domains_batch))
            future_to_domain[future] = domain
        
        for future in as_completed(future_to_domain):
            domain = future_to_domain[future]
            try:
                result = future.result()
                results.append(result)
            except Exception as e:
                thread_safe_log('error', f"ERROR: Exception occurred for domain {domain}: {e}")
                results.append({'domain': domain, 'error': str(e)})
    
    return results

def main():
    """Main function to orchestrate the processing of all domains."""
    if not PERPLEXITY_API_KEY:
        logger.error("FATAL: PERPLEXITY_API_KEY is not set. Exiting.")
        return

    target_domains = get_target_domains()
    
    if not target_domains:
        logger.error("No domains found in target list. Exiting.")
        return

    Path(OUTPUT_FOLDER).mkdir(exist_ok=True)
    logger.info(f"Output folder '{OUTPUT_FOLDER}' created/verified")

    logger.info(f"\nStarting parallel processing with {MAX_WORKERS} threads")
    logger.info(f"Processing {len(target_domains)} domains in batches of {BATCH_SIZE}")

    total_queries_processed = 0
    total_cost_collected = 0.0
    successful_domains = 0
    failed_domains = 0
    
    for batch_start in range(0, len(target_domains), BATCH_SIZE):
        batch_end = min(batch_start + BATCH_SIZE, len(target_domains))
        domains_batch = target_domains[batch_start:batch_end]
        
        batch_number = (batch_start // BATCH_SIZE) + 1
        total_batches = (len(target_domains) + BATCH_SIZE - 1) // BATCH_SIZE
        
        logger.info(f"\n{'*'*80}")
        logger.info(f"PROCESSING BATCH {batch_number}/{total_batches}")
        logger.info(f"Domains {batch_start + 1} to {batch_end} of {len(target_domains)}")
        logger.info(f"{'*'*80}")
        
        batch_results = process_domains_in_parallel(domains_batch, batch_start)
        
        batch_queries_count = 0
        batch_cost = 0.0
        
        for result in batch_results:
            if 'error' not in result:
                batch_queries_count += result.get('total_queries', 0)
                batch_cost += result.get('total_cost', 0.0)
                successful_domains += 1
                logger.info(f"✓ {result['domain']}: {result['total_queries']} Queries | ${result.get('total_cost', 0.0):.4f}")
            else:
                failed_domains += 1
                logger.error(f"✗ {result['domain']}: ERROR - {result.get('error', 'Unknown error')}")
        
        total_queries_processed += batch_queries_count
        total_cost_collected += batch_cost
        
        logger.info(f"\nBatch {batch_number} Summary:")
        logger.info(f"  Batch Queries: {batch_queries_count}")
        logger.info(f"  Batch Cost: ${batch_cost:.4f}")
        logger.info(f"  Running Total Queries: {total_queries_processed}")
        logger.info(f"  Running Total Cost: ${total_cost_collected:.4f}")

    logger.info("\n" + "="*80)
    logger.info("ALL DOMAINS PROCESSING COMPLETED!")
    logger.info("="*80)
    logger.info(f"Total domains processed: {len(target_domains)}")
    logger.info(f"Successful domains: {successful_domains}")
    logger.info(f"Failed domains: {failed_domains}")
    logger.info(f"Total Queries processed: {total_queries_processed}")
    logger.info(f"Total Cost: ${total_cost_collected:.4f}")
    logger.info(f"All output files saved in: {OUTPUT_FOLDER}")
    logger.info("="*80)

if __name__ == "__main__":
    main()


2025-10-05 11:55:58,863 [MainThread] INFO: 
Using predefined target domains:
2025-10-05 11:55:58,865 [MainThread] INFO:   1. ox.ac.uk
2025-10-05 11:55:58,866 [MainThread] INFO:   2. harvard.edu
2025-10-05 11:55:58,868 [MainThread] INFO: 
Total domains to process: 2
2025-10-05 11:55:58,871 [MainThread] INFO: Output folder 'universities_informations' created/verified
2025-10-05 11:55:58,872 [MainThread] INFO: 
Starting parallel processing with 30 threads
2025-10-05 11:55:58,874 [MainThread] INFO: Processing 2 domains in batches of 30
2025-10-05 11:55:58,875 [MainThread] INFO: 
********************************************************************************
2025-10-05 11:55:58,876 [MainThread] INFO: PROCESSING BATCH 1/1
2025-10-05 11:55:58,877 [MainThread] INFO: Domains 1 to 2 of 2
2025-10-05 11:55:58,878 [MainThread] INFO: ********************************************************************************
2025-10-05 11:55:58,879 [MainThread] INFO: 
2025-10-05 11:55:58,880 [MainThread] INFO:

✓ PERPLEXITY_API_KEY loaded successfully (starts with: pplx-OUK...)


2025-10-05 11:56:02,485 [ThreadPoolExecutor-0_1] INFO: HTTP Request: POST https://api.perplexity.ai/chat/completions "HTTP/1.1 200 OK"
2025-10-05 11:56:02,502 [ThreadPoolExecutor-0_1] INFO:   [Thread ThreadPoolExecutor-0_1] Processed 'About Us'
2025-10-05 11:56:02,645 [ThreadPoolExecutor-0_0] INFO: HTTP Request: POST https://api.perplexity.ai/chat/completions "HTTP/1.1 200 OK"
2025-10-05 11:56:02,649 [ThreadPoolExecutor-0_0] INFO:   [Thread ThreadPoolExecutor-0_0] Processed 'About Us'
2025-10-05 11:56:03,003 [ThreadPoolExecutor-0_1] INFO: [Thread ThreadPoolExecutor-0_1] [2/47] Processing 'Vision' for harvard.edu (model: sonar)...
2025-10-05 11:56:03,151 [ThreadPoolExecutor-0_0] INFO: [Thread ThreadPoolExecutor-0_0] [2/47] Processing 'Vision' for ox.ac.uk (model: sonar)...
2025-10-05 11:56:06,397 [ThreadPoolExecutor-0_0] INFO: HTTP Request: POST https://api.perplexity.ai/chat/completions "HTTP/1.1 200 OK"
2025-10-05 11:56:06,402 [ThreadPoolExecutor-0_0] INFO:   [Thread ThreadPoolExecuto