itsalissonsilva committed on
Commit
f907e1a
·
verified ·
1 Parent(s): 37559e5

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +12 -13
src/streamlit_app.py CHANGED
@@ -15,16 +15,12 @@ client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
15
 
16
  PROMPT_INSTRUCTIONS_TEXT = """
17
  You are a forensic auditor AI with deep domain expertise and a sharp eye for irregularities. Your job is to identify **anomalies** in a single column of financial data.
18
-
19
  Analyze the values provided and return only values that are:
20
  - **Numerical outliers**: extremely high/low or oddly rounded numbers
21
  - **Format inconsistencies**: strange symbols, irregular formatting, or data corruption
22
  - **Rare or suspicious values**: strings or categories that do not appear to fit the overall pattern
23
-
24
  ONLY analyze the values from the provided column, without relying on any external context.
25
-
26
  Return ONLY the following JSON object and nothing else:
27
-
28
  {
29
  "anomalies": [
30
  {
@@ -91,27 +87,31 @@ st.markdown("""
91
  This tool combines machine learning and large language models to detect anomalies in datasets. We first apply isolation forest to the full dataset to flag data-level outliers. Then, you can select one column to perform a second pass of analysis using OpenAI's GPT-4, which focuses on semantic and contextual anomalies within that column only (e.g. Payment_Method column).
92
  """)
93
 
94
- # Button to load sample data
95
- df = None
96
- sample_loaded = False
 
 
97
  if st.button("Load sample dataset"):
98
- sample_path = "src/df_crypto.csv"
99
  try:
100
- df = pd.read_csv(sample_path)
101
- sample_loaded = True
102
  st.success("Sample dataset loaded from `src/df_crypto.csv`.")
103
  except Exception as e:
104
  st.error(f"Could not load sample dataset: {e}")
105
 
106
  # File upload
107
- if not sample_loaded:
108
  uploaded_file = st.file_uploader("Or upload your own CSV file", type=["csv"])
109
  if uploaded_file:
110
  try:
111
- df = pd.read_csv(uploaded_file)
 
112
  except Exception as e:
113
  st.error(f"Could not read uploaded CSV. Error: {e}")
114
 
 
 
 
115
  if df is not None:
116
  st.subheader("Full Dataset")
117
  st.dataframe(df, use_container_width=True)
@@ -126,7 +126,6 @@ if df is not None:
126
 
127
  # ---------------- LLM Section ----------------
128
  st.markdown("### LLM-Based Anomaly Detection (specific column)")
129
-
130
  selected_column = st.selectbox("Select a column to analyze with LLM:", df.columns)
131
 
132
  if st.button("Run LLM Anomaly Detection on selected column"):
 
15
 
16
  PROMPT_INSTRUCTIONS_TEXT = """
17
  You are a forensic auditor AI with deep domain expertise and a sharp eye for irregularities. Your job is to identify **anomalies** in a single column of financial data.
 
18
  Analyze the values provided and return only values that are:
19
  - **Numerical outliers**: extremely high/low or oddly rounded numbers
20
  - **Format inconsistencies**: strange symbols, irregular formatting, or data corruption
21
  - **Rare or suspicious values**: strings or categories that do not appear to fit the overall pattern
 
22
  ONLY analyze the values from the provided column, without relying on any external context.
 
23
  Return ONLY the following JSON object and nothing else:
 
24
  {
25
  "anomalies": [
26
  {
 
87
  This tool combines machine learning and large language models to detect anomalies in datasets. We first apply isolation forest to the full dataset to flag data-level outliers. Then, you can select one column to perform a second pass of analysis using OpenAI's GPT-4, which focuses on semantic and contextual anomalies within that column only (e.g. Payment_Method column).
88
  """)
89
 
90
+ # Initialize session state for df
91
+ if "df" not in st.session_state:
92
+ st.session_state.df = None
93
+
94
+ # Load sample data
95
  if st.button("Load sample dataset"):
 
96
  try:
97
+ st.session_state.df = pd.read_csv("src/df_crypto.csv")
 
98
  st.success("Sample dataset loaded from `src/df_crypto.csv`.")
99
  except Exception as e:
100
  st.error(f"Could not load sample dataset: {e}")
101
 
102
  # File upload
103
+ if st.session_state.df is None:
104
  uploaded_file = st.file_uploader("Or upload your own CSV file", type=["csv"])
105
  if uploaded_file:
106
  try:
107
+ st.session_state.df = pd.read_csv(uploaded_file)
108
+ st.success("Custom dataset uploaded.")
109
  except Exception as e:
110
  st.error(f"Could not read uploaded CSV. Error: {e}")
111
 
112
+ # Use persisted df
113
+ df = st.session_state.df
114
+
115
  if df is not None:
116
  st.subheader("Full Dataset")
117
  st.dataframe(df, use_container_width=True)
 
126
 
127
  # ---------------- LLM Section ----------------
128
  st.markdown("### LLM-Based Anomaly Detection (specific column)")
 
129
  selected_column = st.selectbox("Select a column to analyze with LLM:", df.columns)
130
 
131
  if st.button("Run LLM Anomaly Detection on selected column"):