diff --git a/.gitignore b/.gitignore index 7265aec..b129e14 100644 --- a/.gitignore +++ b/.gitignore @@ -167,3 +167,5 @@ cython_debug/ # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ + +.streamlit/ \ No newline at end of file diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..1298ead --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,19 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Debug Streamlit", + "type": "debugpy", + "request": "launch", + "module": "streamlit", + "args": [ + "run", + "${file}" + ], + "justMyCode": false, + } + ] +} \ No newline at end of file diff --git a/README.md b/README.md index 6996e68..3f6098d 100644 --- a/README.md +++ b/README.md @@ -6,24 +6,6 @@ #### Generate publication highlights using AI -### Setting Up OpenAI API Key - -To use the OpenAI API, you need to set up an API key and set it as an environment variable. - -1. Obtain your OpenAI API key from the [OpenAI website](https://platform.openai.com/api-keys). - -2. Set the API key as an environment variable: - - On Windows: - ```bash - set OPENAI_API_KEY=your_api_key_here - ``` - - On macOS and Linux: - ```bash - export OPENAI_API_KEY=your_api_key_here - ``` - -Replace `your_api_key_here` with your actual OpenAI API key. - ### Installation #### Clone this repository @@ -59,7 +41,7 @@ To install this Python package in a virtual environment, you can use either pip #### Using Anaconda -1. Create a virtual environment: +1. 
Create a virtual environment (**Note:** Support for models provided by Dartmouth requires Python 3.10 or higher.): ```bash conda create --name highlight_env python=3.9 ``` @@ -69,10 +51,44 @@ To install this Python package in a virtual environment, you can use either pip conda activate highlight_env ``` -3. Install the package from the cloned `highlight` directory: +3. Install the package from the cloned `highlight` directory. ```bash pip install . ``` + By default, only OpenAI's models are supported. You can add support for other providers by installing one or more of the package's optional dependencies: + ```bash + pip install ".[anthropic, dartmouth, google, mistral, ollama]" + ``` + +### Configuring the LLMs +You can configure a variety of LLMs, both remote and local, to choose from in the app. To do this, edit the file `config.toml` in the root of this repo. + +#### Setting Up API Keys +If any of the configured models require an API key, you need to set them up as Streamlit secrets. + +First, you have to acquire the API key from the provider of your choice (e.g., [OpenAI](https://platform.openai.com/api-keys), [Google](https://ai.google.dev/gemini-api/docs/api-key), [Dartmouth](https://developer.dartmouth.edu/keys), [Mistral](https://console.mistral.ai/api-keys/), or [Anthropic](https://console.anthropic.com/settings/keys)). + +Then, create a folder named `.streamlit` in the root of this repo. Inside that folder, create a file named `secrets.toml`. + +Finally, store your API key(s) in the following format: + +```toml +KEY_NAME = "your_api_key_here" +``` + +Replace `KEY_NAME` with the variable name corresponding to your provider: + +| Provider | Variable name | +| -------- | ------- | +| Anthropic | `ANTHROPIC_API_KEY` | +| Dartmouth | `DARTMOUTH_API_KEY` | +| Google | `GOOGLE_API_KEY` | +| Mistral | `MISTRAL_API_KEY` | +| OpenAI | `OPENAI_API_KEY` | + + +Replace `your_api_key_here` with your actual API key. 
+ ### Running the App diff --git a/app.py b/app.py index d42f03f..0732410 100644 --- a/app.py +++ b/app.py @@ -6,21 +6,14 @@ from pptx import Presentation from pptx.util import Pt from pptx.enum.text import PP_ALIGN -from openai import OpenAI + import streamlit as st import highlight as hlt +if "config" not in st.session_state: + st.session_state["config"] = hlt.read_config("config.toml") -if "client" not in st.session_state: - key = os.getenv("OPENAI_API_KEY", default=None) - if key is None: - raise KeyError(( - "No key found for 'OPENAI_API_KEY' system variable. " + - "Obtain your OpenAI API key from the OpenAI website: https://platform.openai.com/api-keys" - )) - else: - st.session_state.client = OpenAI(api_key=key) if "reduce_document" not in st.session_state: st.session_state.reduce_document = False @@ -125,28 +118,25 @@ # Render streamlit page st.title("Research Highlight Generator") -st.markdown(( - "This app uses a Large Language Model (LLM) of your choosing to generate " + - " formatted research highlight content from an input file." -)) - -st.session_state.model = st.selectbox( - label="Select your model:", - options=("gpt-4o", "gpt-4", "gpt-3.5-turbo-16k", "gpt-3.5-turbo") +st.markdown( + ( + "This app uses a Large Language Model (LLM) of your choosing to generate " + + " formatted research highlight content from an input file." 
+ ) ) -if st.session_state.model == "gpt-4-32k": - st.session_state.max_allowable_tokens = 32768 -elif st.session_state.model == "gpt-4": - st.session_state.max_allowable_tokens = 8192 -elif st.session_state.model == "gpt-3.5-turbo-16k": - st.session_state.max_allowable_tokens = 16384 -elif st.session_state.model == "gpt-3.5-turbo": - st.session_state.max_allowable_tokens = 4096 -elif st.session_state.model == "gpt-4o": - st.session_state.max_allowable_tokens = 150000 - -# set api key +llm_id = st.selectbox( + label="Select your model:", options=hlt.llm.list_llms(st.session_state["config"]) +) +st.session_state.provider, st.session_state.model = llm_id.split(": ") +st.session_state.max_allowable_tokens = st.session_state.config["llm"][ + st.session_state.provider +][st.session_state.model]["max_allowable_tokens"] + +st.session_state.client = hlt.llm.get_llm( + st.session_state.provider, + max_context_length=st.session_state.max_allowable_tokens, +) st.markdown("### Upload file to process:") uploaded_file = st.file_uploader( @@ -165,31 +155,33 @@ st.session_state.output_file = uploaded_file.name - st.code(f"""File specs:\n + st.code( + f"""File specs:\n - Number of pages: {content_dict['n_pages']} - Number of characters: {content_dict['n_characters']} - Number of words: {content_dict['n_words']} - Number of tokens: {content_dict['n_tokens']} - """) + """ + ) - if content_dict['n_tokens'] > st.session_state.max_allowable_tokens: + if content_dict["n_tokens"] > st.session_state.max_allowable_tokens: msg = f""" The number of tokens in your document exceeds the maximum allowable tokens. This will cause your queries to fail. The queries account for the number of tokens in a prompt + the number of tokens in your document. 
- + Maximum allowable token count: {st.session_state.max_allowable_tokens} - + Your documents token count: {content_dict['n_tokens']} - + Token deficit: {content_dict['n_tokens'] - st.session_state.max_allowable_tokens} """ st.error(msg, icon="🚨") st.session_state.reduce_document = st.radio( - """Would you like me to attempt to reduce the size of - your document by keeping only relevant information? - If so, I will give you a file to download with the content + """Would you like me to attempt to reduce the size of + your document by keeping only relevant information? + If so, I will give you a file to download with the content so you only have to do this once. If you choose to go through with this, it may take a while to process, usually on the order of 15 minutes for a 20K token @@ -197,7 +189,7 @@ Alternatively, you can copy and paste the contents that you know are of interest into a text file and upload that instead. - + """, ("Yes", "No"), ) @@ -210,28 +202,26 @@ title_container.markdown("##### Generate title from text content") # title criteria - title_container.markdown(""" + title_container.markdown( + """ The title should meet the following criteria: - No colons are allowed in the output. - Should pique the interest of the reader while still being somewhat descriptive. - Be understandable to a general audience. - Should be only once sentence. - Should have a maximum length of 10 words. 
- """) + """ + ) title_container.markdown("Set desired temperature:") # title slider title_temperature = title_container.slider( - "Title Temperature", - 0.0, - 1.0, - 0.2, - label_visibility="collapsed" + "Title Temperature", 0.0, 1.0, 0.2, label_visibility="collapsed" ) # build container content - if title_container.button('Generate Title'): + if title_container.button("Generate Title"): st.session_state.title_response = hlt.generate_content( client=st.session_state.client, @@ -243,7 +233,7 @@ temperature=title_temperature, box_height=50, max_allowable_tokens=st.session_state.max_allowable_tokens, - model=st.session_state.model + model=st.session_state.model, ) else: @@ -253,7 +243,7 @@ label="Title Result:", value=st.session_state.title_response, label_visibility="collapsed", - height=50 + height=50, ) # subtitle section @@ -261,29 +251,29 @@ subtitle_container.markdown("##### Generate subtitle from text content") # subtitle criteria - subtitle_container.markdown(""" + subtitle_container.markdown( + """ The subtitle should meet the following criteria: - Be an extension of and related to, but not directly quote, the title. - Provide information that will make the audience want to find out more about the research. - Do not use more than 155 characters including spaces. - """) + """ + ) subtitle_container.markdown("Set desired temperature:") # subtitle slider subtitle_temperature = subtitle_container.slider( - "Subtitle Temperature", - 0.0, - 1.0, - 0.5, - label_visibility="collapsed" + "Subtitle Temperature", 0.0, 1.0, 0.5, label_visibility="collapsed" ) # build container content - if subtitle_container.button('Generate Subtitle'): + if subtitle_container.button("Generate Subtitle"): if st.session_state.title_response is None: - st.write("Please generate a Title first. Subtitle generation considers the title response.") + st.write( + "Please generate a Title first. Subtitle generation considers the title response." 
+ ) else: st.session_state.subtitle_response = hlt.generate_content( @@ -299,7 +289,7 @@ max_word_count=100, min_word_count=75, max_allowable_tokens=st.session_state.max_allowable_tokens, - model=st.session_state.model + model=st.session_state.model, ) else: @@ -309,7 +299,7 @@ label="Subtitle Result:", value=st.session_state.subtitle_response, label_visibility="collapsed", - height=50 + height=50, ) # science section @@ -317,9 +307,10 @@ science_container.markdown("##### Generate science summary from text content") # science criteria - science_container.markdown(""" + science_container.markdown( + """ **GOAL**: Describe the scientific results for a non-expert, non-scientist audience. - + The description should meet the following criteria: - Answer what the big challenge in this field of science is that the research addresses. - State what the key finding is. @@ -327,26 +318,23 @@ - Be understandable to a high school senior or college freshman. - Use short sentences and succinct words. - Avoid technical terms if possible. If technical terms are necessary, define them. - - Provide the necessary context so someone can have a very basic understanding of what you did. + - Provide the necessary context so someone can have a very basic understanding of what you did. - Start with topics that the reader already may know and move on to more complex ideas. - Use present tense. - In general, the description should speak about the research or researchers in first person. - - Use a minimum of 75 words and a maximum of 100 words. - """) + - Use a minimum of 75 words and a maximum of 100 words. 
+ """ + ) science_container.markdown("Set desired temperature:") # slider science_temperature = science_container.slider( - "Science Summary Temperature", - 0.0, - 1.0, - 0.3, - label_visibility="collapsed" + "Science Summary Temperature", 0.0, 1.0, 0.3, label_visibility="collapsed" ) # build container content - if science_container.button('Generate Science Summary'): + if science_container.button("Generate Science Summary"): st.session_state.science_response = hlt.generate_content( client=st.session_state.client, container=science_container, @@ -359,7 +347,7 @@ max_word_count=100, min_word_count=75, max_allowable_tokens=st.session_state.max_allowable_tokens, - model=st.session_state.model + model=st.session_state.model, ) else: @@ -369,44 +357,41 @@ label="Science Summary Result:", value=st.session_state.science_response, label_visibility="collapsed", - height=250 + height=250, ) # impact section impact_container = st.container() impact_container.markdown("##### Generate impact summary from text content") - impact_container.markdown(""" + impact_container.markdown( + """ **GOAL**: Describe the impact of the research to a non-expert, non-scientist audience. - + The description should meet the following criteria: - Answer why the findings presented are important, i.e., what problem the research is trying to solve. - Answer if the finding is the first of its kind. - Answer what was innovative or distinct about the research. - Answer what the research enables other scientists in your field to do next. - - Include other scientific fields potentially impacted. - - Be understandable to a high school senior or college freshman. + - Include other scientific fields potentially impacted. + - Be understandable to a high school senior or college freshman. - Use short sentences and succinct words. - Avoid technical terms if possible. If technical terms are necessary, define them. - Use present tense. 
- In general, the description should speak about the research or researchers in first person. - - Use a minimum of 75 words and a maximum of 100 words. - """) - + - Use a minimum of 75 words and a maximum of 100 words. + """ + ) impact_container.markdown("Set desired temperature:") # slider impact_temperature = impact_container.slider( - "Impact Summary Temperature", - 0.0, - 1.0, - 0.0, - label_visibility="collapsed" + "Impact Summary Temperature", 0.0, 1.0, 0.0, label_visibility="collapsed" ) # build container content - if impact_container.button('Generate Impact Summary'): + if impact_container.button("Generate Impact Summary"): st.session_state.impact_response = hlt.generate_content( client=st.session_state.client, container=impact_container, @@ -419,7 +404,7 @@ max_word_count=100, min_word_count=75, max_allowable_tokens=st.session_state.max_allowable_tokens, - model=st.session_state.model + model=st.session_state.model, ) else: @@ -429,40 +414,38 @@ label="Impact Summary Result:", value=st.session_state.impact_response, label_visibility="collapsed", - height=250 + height=250, ) # general summary section summary_container = st.container() summary_container.markdown("##### Generate general summary from text content") - summary_container.markdown(""" + summary_container.markdown( + """ **GOAL**: Generate a general summary of the current research. - + The summary should meet the following criteria: - Should relay key findings and value. - - The summary should be still accessible to the non-specialist but may be more technical if necessary. - - Do not mention the names of institutions. - - If there is a United States Department of Energy Office of Science user facility involved, such as NERSC, you can mention the user facility. + - The summary should be still accessible to the non-specialist but may be more technical if necessary. + - Do not mention the names of institutions. 
+ - If there is a United States Department of Energy Office of Science user facility involved, such as NERSC, you can mention the user facility. - Should be 1 or 2 paragraphs detailing the research. - Use present tense. - In general, the description should speak about the research or researchers in first person. - Use no more than 200 words. - """) + """ + ) summary_container.markdown("Set desired temperature:") # slider summary_temperature = summary_container.slider( - "General Summary Temperature", - 0.0, - 1.0, - 0.3, - label_visibility="collapsed" + "General Summary Temperature", 0.0, 1.0, 0.3, label_visibility="collapsed" ) # build container content - if summary_container.button('Generate General Summary'): + if summary_container.button("Generate General Summary"): st.session_state.summary_response = hlt.generate_content( client=st.session_state.client, container=summary_container, @@ -475,7 +458,7 @@ max_word_count=200, min_word_count=100, max_allowable_tokens=st.session_state.max_allowable_tokens, - model=st.session_state.model + model=st.session_state.model, ) else: @@ -485,12 +468,14 @@ label="General Summary Result:", value=st.session_state.summary_response, label_visibility="collapsed", - height=400 + height=400, ) # figure recommendations section figure_container = st.container() - figure_container.markdown("##### Generate figure search string recommendations from the general summary") + figure_container.markdown( + "##### Generate figure search string recommendations from the general summary" + ) figure_container.markdown("Set desired temperature:") # slider @@ -499,11 +484,11 @@ 0.0, 1.0, 0.9, - label_visibility="collapsed" + label_visibility="collapsed", ) # build container content - if figure_container.button('Generate Figure Recommendations'): + if figure_container.button("Generate Figure Recommendations"): if st.session_state.summary_response is None: st.write("Please generate a general summary first.") @@ -518,7 +503,7 @@ 
temperature=figure_temperature, box_height=200, max_allowable_tokens=st.session_state.max_allowable_tokens, - model=st.session_state.model + model=st.session_state.model, ) else: @@ -529,10 +514,9 @@ label="Figure Recommendations Result:", value=st.session_state.figure_response, label_visibility="collapsed", - height=200 + height=200, ) - figure_summary_container = st.container() figure_summary_container.markdown( "##### Generate a figure caption that summarizes the work generally to use with the artistic photo above" @@ -541,15 +525,11 @@ # slider figure_summary_container.markdown("Set desired temperature:") figure_summary_temperature = figure_summary_container.slider( - "Figure Caption Temperature", - 0.0, - 1.0, - 0.1, - label_visibility="collapsed" + "Figure Caption Temperature", 0.0, 1.0, 0.1, label_visibility="collapsed" ) # build container content - if figure_summary_container.button('Generate Figure Caption'): + if figure_summary_container.button("Generate Figure Caption"): if st.session_state.summary_response is None: st.write("Please generate a general summary first.") @@ -564,7 +544,7 @@ temperature=figure_temperature, box_height=200, max_allowable_tokens=st.session_state.max_allowable_tokens, - model=st.session_state.model + model=st.session_state.model, ).replace('"', "") else: @@ -574,14 +554,14 @@ label="Figure Caption Result:", value=st.session_state.figure_caption, label_visibility="collapsed", - height=200 + height=200, ) # citation recommendations section citation_container = st.container() citation_container.markdown("##### Citation for the paper in Chicago style") - - if citation_container.button('Generate Citation'): + + if citation_container.button("Generate Citation"): st.session_state.citation = hlt.generate_content( client=st.session_state.client, container=citation_container, @@ -592,7 +572,7 @@ temperature=0.0, box_height=200, max_allowable_tokens=st.session_state.max_allowable_tokens, - model=st.session_state.model + 
model=st.session_state.model, ).replace('"', "") else: @@ -601,14 +581,14 @@ label="Citation", value=st.session_state.citation, label_visibility="collapsed", - height=200 + height=200, ) # funding recommendations section funding_container = st.container() funding_container.markdown("##### Funding statement from the paper") - - if funding_container.button('Generate funding statement'): + + if funding_container.button("Generate funding statement"): st.session_state.funding = hlt.generate_content( client=st.session_state.client, container=funding_container, @@ -619,7 +599,7 @@ temperature=0.0, box_height=200, max_allowable_tokens=st.session_state.max_allowable_tokens, - model=st.session_state.model + model=st.session_state.model, ).replace('"', "") else: @@ -628,7 +608,7 @@ label="Funding statement", value=st.session_state.funding, label_visibility="collapsed", - height=200 + height=200, ) # point of contact box @@ -637,20 +617,19 @@ # select the POC information from the dropdown st.session_state.point_of_contact = st.session_state.project_dict[ - poc_container.selectbox( + poc_container.selectbox( label="Select the project who funded the work:", options=[ - "COMPASS-GLM", - "GCIMS", + "COMPASS-GLM", + "GCIMS", "ICoM", "IM3", "Puget Sound", "Other", - ] + ], ) ] - poc_container.write("What will be written to the document as the point of contact:") poc_parts = st.session_state.point_of_contact.split("\n") poc_container.success( @@ -666,23 +645,25 @@ # template parameters word_parameters = { - 'title': st.session_state.title_response, - 'subtitle': st.session_state.subtitle_response, - 'photo': st.session_state.photo, - 'photo_link': st.session_state.photo_link, - 'photo_site_name': st.session_state.photo_site_name, - 'image_caption': st.session_state.figure_caption, - 'science': st.session_state.science_response, - 'impact': st.session_state.impact_response, - 'summary': st.session_state.summary_response, - 'funding': st.session_state.funding, - 'citation': 
st.session_state.citation, - 'related_links': st.session_state.related_links, - 'point_of_contact': st.session_state.point_of_contact, + "title": st.session_state.title_response, + "subtitle": st.session_state.subtitle_response, + "photo": st.session_state.photo, + "photo_link": st.session_state.photo_link, + "photo_site_name": st.session_state.photo_site_name, + "image_caption": st.session_state.figure_caption, + "science": st.session_state.science_response, + "impact": st.session_state.impact_response, + "summary": st.session_state.summary_response, + "funding": st.session_state.funding, + "citation": st.session_state.citation, + "related_links": st.session_state.related_links, + "point_of_contact": st.session_state.point_of_contact, } # template word document - word_template_file = importlib.resources.files('highlight.data').joinpath('highlight_template.docx') + word_template_file = importlib.resources.files("highlight.data").joinpath( + "highlight_template.docx" + ) template = DocxTemplate(word_template_file) template.render(word_parameters) @@ -693,7 +674,7 @@ label="Export Word Document", data=bio.getvalue(), file_name="modified_template.docx", - mime="docx" + mime="docx", ) # power point slide content @@ -703,28 +684,26 @@ objective_container = st.container() objective_container.markdown("##### Generate objective summary from text content") - objective_container.markdown(""" + objective_container.markdown( + """ **GOAL**: Generate one sentence stating the core purpose of the study. - + The sentence should meet the following criteria: - Use active verbs for the start of each point. - Use present tense. 
- Do not include methodology related to statistical, technological, and theory based - """) + """ + ) objective_container.markdown("Set desired temperature:") # slider objective_temperature = objective_container.slider( - "Objective Temperature", - 0.0, - 1.0, - 0.3, - label_visibility="collapsed" + "Objective Temperature", 0.0, 1.0, 0.3, label_visibility="collapsed" ) # build container content - if objective_container.button('Generate Objective'): + if objective_container.button("Generate Objective"): st.session_state.objective_response = hlt.generate_content( client=st.session_state.client, container=objective_container, @@ -735,7 +714,7 @@ temperature=objective_temperature, box_height=250, max_allowable_tokens=st.session_state.max_allowable_tokens, - model=st.session_state.model + model=st.session_state.model, ) else: @@ -745,35 +724,33 @@ label="Objective Result:", value=st.session_state.objective_response, label_visibility="collapsed", - height=250 + height=250, ) # approach section approach_container = st.container() approach_container.markdown("##### Generate approach summary from text content") - approach_container.markdown(""" + approach_container.markdown( + """ **GOAL**: Clearly and concisely state in 2-3 short points how this work accomplished the stated objective from a methodolgocial perspecive. - - Based off of the objective summary - - Only include methodology including but not limited to: statistical, technological, and theory based approaches. + - Based off of the objective summary + - Only include methodology including but not limited to: statistical, technological, and theory based approaches. - Use a different action verb to start sentences than what is used to begin the objective statement. - - Use active verbs for the start of each point. + - Use active verbs for the start of each point. - Use present tense. 
- """) + """ + ) approach_container.markdown("Set desired temperature:") # slider approach_temperature = approach_container.slider( - "Approach Temperature", - 0.0, - 1.0, - 0.1, - label_visibility="collapsed" + "Approach Temperature", 0.0, 1.0, 0.1, label_visibility="collapsed" ) # build container content - if approach_container.button('Generate Approach'): + if approach_container.button("Generate Approach"): st.session_state.approach_response = hlt.generate_content( client=st.session_state.client, container=approach_container, @@ -785,7 +762,7 @@ box_height=250, additional_content=st.session_state.objective_response, max_allowable_tokens=st.session_state.max_allowable_tokens, - model=st.session_state.model + model=st.session_state.model, ) else: @@ -795,15 +772,16 @@ label="Approach Result:", value=st.session_state.approach_response, label_visibility="collapsed", - height=250 + height=250, ) # power point impact section ppt_impact_container = st.container() ppt_impact_container.markdown("##### Generate impact points from text content") - ppt_impact_container.markdown(""" - **GOAL**: Clearly and concisely state in 3 points the key results and outcomes from this research. + ppt_impact_container.markdown( + """ + **GOAL**: Clearly and concisely state in 3 points the key results and outcomes from this research. - State what the results indicate. - Include results that may be considered profound or surprising. - Each point should be 1 concise sentence. 
@@ -815,15 +793,11 @@ # slider ppt_impact_temperature = ppt_impact_container.slider( - "Impact Points Temperature", - 0.0, - 1.0, - 0.1, - label_visibility="collapsed" + "Impact Points Temperature", 0.0, 1.0, 0.1, label_visibility="collapsed" ) # build container content - if ppt_impact_container.button('Generate Impact Points'): + if ppt_impact_container.button("Generate Impact Points"): st.session_state.ppt_impact_response = hlt.generate_content( client=st.session_state.client, container=ppt_impact_container, @@ -834,7 +808,7 @@ temperature=ppt_impact_temperature, box_height=250, max_allowable_tokens=st.session_state.max_allowable_tokens, - model=st.session_state.model + model=st.session_state.model, ) else: @@ -844,35 +818,33 @@ label="Impact Points Result:", value=st.session_state.ppt_impact_response, label_visibility="collapsed", - height=250 + height=250, ) # power point figure selection section ppt_figure_selection = st.container() ppt_figure_selection.markdown("##### Select a representative figure from the paper") - ppt_figure_selection.markdown(""" + ppt_figure_selection.markdown( + """ **GOAL**: What figure best represents the high impact content that can be easily understood by a non-technical, non-scientifc audience. - + Limit the response to: 1. The figure name as it is written in the text, 2. An explanation of why it was chosen, 3. And what the figure is about in less than 50 words. 
- """) + """ + ) ppt_figure_selection.markdown("Set desired temperature:") # slider ppt_figure_selection_temperature = ppt_figure_selection.slider( - "Figure recommendation Temperature", - 0.0, - 1.0, - 0.2, - label_visibility="collapsed" + "Figure recommendation Temperature", 0.0, 1.0, 0.2, label_visibility="collapsed" ) # build container content - if ppt_figure_selection.button('Generate Figure Recommendation'): + if ppt_figure_selection.button("Generate Figure Recommendation"): st.session_state.figure_recommendation = hlt.generate_content( client=st.session_state.client, container=ppt_figure_selection, @@ -883,7 +855,7 @@ temperature=ppt_figure_selection_temperature, box_height=250, max_allowable_tokens=st.session_state.max_allowable_tokens, - model=st.session_state.model + model=st.session_state.model, ) else: @@ -893,23 +865,29 @@ label="Figure Recommendation Result:", value=st.session_state.figure_recommendation, label_visibility="collapsed", - height=250 + height=250, ) # Add PowerPoint export container at the end export_ppt_container = st.container() - export_ppt_container.markdown("##### Export PowerPoint Presentation with New Content") + export_ppt_container.markdown( + "##### Export PowerPoint Presentation with New Content" + ) - if ("title_response" in st.session_state and - "objective_response" in st.session_state and - "ppt_impact_response" in st.session_state and - "approach_response" in st.session_state): + if ( + "title_response" in st.session_state + and "objective_response" in st.session_state + and "ppt_impact_response" in st.session_state + and "approach_response" in st.session_state + ): - if export_ppt_container.button('Export PowerPoint'): + if export_ppt_container.button("Export PowerPoint"): try: # Load the PowerPoint template - ppt_template_file = importlib.resources.files('highlight.data').joinpath('highlight_template.pptx') + ppt_template_file = importlib.resources.files( + "highlight.data" + ).joinpath("highlight_template.pptx") prs = 
Presentation(ppt_template_file) # Split the impact and approach responses into bullet points (assuming they are separated by newlines) @@ -927,7 +905,9 @@ # Ensure font size and bold settings are maintained for each paragraph for paragraph in shape.text_frame.paragraphs: for run in paragraph.runs: - run.font.size = Pt(24) # Example size, adjust as needed + run.font.size = Pt( + 24 + ) # Example size, adjust as needed run.font.bold = True # Maintain bold run.alignment = PP_ALIGN.LEFT # Align title @@ -938,13 +918,19 @@ # Ensure font size and bold settings are maintained for each paragraph for paragraph in shape.text_frame.paragraphs: for run in paragraph.runs: - run.font.size = Pt(11) # Example size for citation, adjust as needed - run.font.bold = False # Citation typically isn't bold + run.font.size = Pt( + 11 + ) # Example size for citation, adjust as needed + run.font.bold = ( + False # Citation typically isn't bold + ) run.alignment = PP_ALIGN.LEFT # Align citation if shape.text_frame.text == "objective_0": # Set the text of the text box to the objective response - shape.text_frame.text = st.session_state.objective_response + shape.text_frame.text = ( + st.session_state.objective_response + ) # Optional: Adjust font size and alignment for the objective for paragraph in shape.text_frame.paragraphs: @@ -957,11 +943,15 @@ shape.text_frame.clear() # Add bullet points for approach - for i, approach_point in enumerate(approach_points[:3]): # Only take the first 3 approach points + for i, approach_point in enumerate( + approach_points[:3] + ): # Only take the first 3 approach points p = shape.text_frame.add_paragraph() p.text = approach_point p.level = 0 # This sets it as a bullet point - p.font.size = Pt(13) # Adjust bullet point font size + p.font.size = Pt( + 13 + ) # Adjust bullet point font size p.alignment = PP_ALIGN.LEFT # Align bullet points # Handle the impact bullet points in the same text box @@ -970,11 +960,15 @@ shape.text_frame.clear() # Add bullet points 
for impact - for i, impact_point in enumerate(impact_points[:3]): # Only take the first 3 impact points + for i, impact_point in enumerate( + impact_points[:3] + ): # Only take the first 3 impact points p = shape.text_frame.add_paragraph() p.text = impact_point p.level = 0 # This sets it as a bullet point - p.font.size = Pt(13) # Adjust bullet point font size + p.font.size = Pt( + 13 + ) # Adjust bullet point font size p.alignment = PP_ALIGN.LEFT # Align bullet points # Save the modified presentation to a BytesIO object @@ -987,13 +981,20 @@ label="Export PowerPoint Presentation", data=ppt_io, file_name="modified_highlight_template.pptx", - mime="application/vnd.openxmlformats-officedocument.presentationml.presentation" + mime="application/vnd.openxmlformats-officedocument.presentationml.presentation", ) - export_ppt_container.success("PowerPoint presentation generated successfully!", icon="✅") + export_ppt_container.success( + "PowerPoint presentation generated successfully!", icon="✅" + ) except Exception as e: - export_ppt_container.error(f"An error occurred while generating the PowerPoint: {e}", icon="🚨") + export_ppt_container.error( + f"An error occurred while generating the PowerPoint: {e}", icon="🚨" + ) else: - export_ppt_container.error("Please generate the objective and impact responses before exporting.", icon="⚠️") + export_ppt_container.error( + "Please generate the objective and impact responses before exporting.", + icon="⚠️", + ) diff --git a/config.toml b/config.toml new file mode 100644 index 0000000..bbc8e9a --- /dev/null +++ b/config.toml @@ -0,0 +1,58 @@ +[llm] + +[llm.anthropic] +models = ["claude-3-5-sonnet-20240620", "claude-3-opus-20240229", "claude-3-sonnet-20240229", "claude-3-haiku-20240307"] + +[llm.anthropic.claude-3-5-sonnet-20240620] +max_allowable_tokens = 200000 + +[llm.anthropic.claude-3-opus-20240229] +max_allowable_tokens = 200000 + +[llm.anthropic.claude-3-sonnet-20240229] +max_allowable_tokens = 200000 + 
+[llm.anthropic.claude-3-haiku-20240307] +max_allowable_tokens = 200000 + +[llm.google] +models = ["gemini-1.5-flash-8b"] + +[llm.google."gemini-1.5-flash-8b"] +max_allowable_tokens = 1_000_000 + +[llm.mistral] +models = ["mistral-small-latest"] + +[llm.mistral.mistral-small-latest] +max_allowable_tokens = 32000 + +[llm.openai] +models = ["gpt-4o", "gpt-4", "gpt-3.5-turbo-16k", "gpt-3.5-turbo"] + +[llm.openai.gpt-4o] +max_allowable_tokens = 150000 + +[llm.openai.gpt-4] +max_allowable_tokens = 8192 + +[llm.openai."gpt-3.5-turbo"] +max_allowable_tokens = 4096 + +[llm.openai."gpt-3.5-turbo-16k"] +max_allowable_tokens = 16384 + +[llm.ollama] +models = ["llama3.2:1b", "llama3.1:latest"] + +[llm.ollama."llama3.2:1b"] +max_allowable_tokens = 128000 + +[llm.ollama."llama3.1:latest"] +max_allowable_tokens = 128000 + +[llm.dartmouth] +models = ["llama-3-1-8b-instruct"] + +[llm.dartmouth."llama-3-1-8b-instruct"] +max_allowable_tokens = 95000 \ No newline at end of file diff --git a/highlight/__init__.py b/highlight/__init__.py index d83aa03..11b5dd2 100644 --- a/highlight/__init__.py +++ b/highlight/__init__.py @@ -1,5 +1,6 @@ from highlight.prompts import prompt_queue from highlight.utils import * +import highlight.llm as llm __version__ = "0.1.0" diff --git a/highlight/llm.py b/highlight/llm.py new file mode 100644 index 0000000..37a674b --- /dev/null +++ b/highlight/llm.py @@ -0,0 +1,73 @@ +from typing import Literal +from langchain_core.runnables import Runnable, ConfigurableField + + +def get_llm( + provider: Literal["openai", "ollama", "dartmouth"], max_context_length: int +) -> Runnable: + """Returns a LangChain Runnable interfacing with the specified provider""" + if provider == "anthropic": + from langchain_anthropic import ChatAnthropic + + return ChatAnthropic(model_name="not-specified").configurable_fields( + model=ConfigurableField(id="model"), + max_tokens=ConfigurableField(id="max_tokens"), + temperature=ConfigurableField(id="temperature"), + ) + elif provider 
== "google": + from langchain_google_genai import ChatGoogleGenerativeAI + + return ChatGoogleGenerativeAI( + model="not-specified", max_tokens=1 + ).configurable_fields( + model=ConfigurableField(id="model"), + max_output_tokens=ConfigurableField(id="max_tokens"), + temperature=ConfigurableField(id="temperature"), + ) + + elif provider == "mistral": + from langchain_mistralai import ChatMistralAI + + return ChatMistralAI().configurable_fields( + model=ConfigurableField(id="model"), + max_tokens=ConfigurableField(id="max_tokens"), + temperature=ConfigurableField(id="temperature"), + ) + + elif provider == "openai": + from langchain_openai import ChatOpenAI + + return ChatOpenAI().configurable_fields( + model_name=ConfigurableField(id="model"), + max_tokens=ConfigurableField(id="max_tokens"), + temperature=ConfigurableField(id="temperature"), + ) + elif provider == "ollama": + from langchain_ollama import ChatOllama + + return ChatOllama( + model="llama3", num_ctx=max_context_length + ).configurable_fields( + model=ConfigurableField(id="model"), + num_predict=ConfigurableField(id="max_tokens"), + temperature=ConfigurableField(id="temperature"), + ) + elif provider == "dartmouth": + from langchain_dartmouth.llms import ChatDartmouth + + return ChatDartmouth().configurable_fields( + model_name=ConfigurableField(id="model"), + max_tokens=ConfigurableField(id="max_tokens"), + temperature=ConfigurableField(id="temperature"), + ) + else: + raise ValueError("Invalid provider") + + +def list_llms(config: dict): + """List all Large Language Models available in the configuration""" + llms = [] + for provider, spec in config["llm"].items(): + for model in spec["models"]: + llms.append(f"{provider}: {model}") + return llms diff --git a/highlight/prompts.py b/highlight/prompts.py index 778f578..72d7561 100644 --- a/highlight/prompts.py +++ b/highlight/prompts.py @@ -5,10 +5,9 @@ prompt_queue = { "system": """You are a technical science editor. 
You are constructing high impact highlight content from recent publications.""", - "title": """ - Generate a title for the text delimited by triple backticks. - + Generate a title for the text delimited by triple backticks. + The title should meet the following criteria: - No colons are allowed in the output. - Should pique the interest of the reader while still being somewhat descriptive. @@ -18,7 +17,7 @@ - Should have a maximum length of 10 words. - Return only the title. - Do not use words like "revolutionizing" or "unraveling" - + The following is an example to use for formatting only. Do not use it in the response. \ The example is delimited by three pound signs. ### @@ -26,12 +25,12 @@ ### The following is the input text delimited by triple backticks. Do not use colons in the response. + Do not use markdown. ```{0}``` """, - "subtitle": """ Generate a subtitle for the text delimited by triple backticks. - + The subtitle should meet the following criteria: - Strictly do not allow colons in the response text. - Be an extension of and related to, but not directly quote, this title delimited by single backticks `{1}` @@ -40,7 +39,7 @@ - Return only the subtitle. - Only capitalize the first letter of the starting word in the sentence unless the word is a proper noun. - Add a period at the end of the response. - + The following is an example to use for formatting only. Do not use it in the response. \ The example is delimited by three pound signs. ### @@ -48,12 +47,12 @@ ### The following is the input text delimited by triple backticks. Do not use colons in the response. + Do not use markdown. ```{0}``` """, - "science": """ Describe the scientific results for a non-expert, non-scientist audience for the text delimited by triple backticks. - + The description should meet the following criteria: - Answer what the big challenge in this field of science is that the research addresses. - State what the key finding is. 
@@ -61,78 +60,77 @@ - Be understandable to a high school senior or college freshman. - Use short sentences and succinct words. - Avoid technical terms if possible. If technical terms are necessary, define them. - - Provide the necessary context so someone can have a very basic understanding of what you did. + - Provide the necessary context so someone can have a very basic understanding of what you did. - Start with topics that the reader already may know and move on to more complex ideas. - Use present tense. - In general, the description should speak about the research or researchers in first person. - - Use a minimum of 75 words and a maximum of 100 words. + - Use a minimum of 75 words and a maximum of 100 words. - Produce only one paragraph. - Return only the description. - + - Do not use markdown. + - Do not use triple backticks to delimit your response. + Finally, do not exceed 100 words in the response. - + ```{0}``` """, - "impact": """ Describe the impact of the research to a non-expert, non-scientist audience for the text delimited by triple backticks. - + The description should meet the following criteria: - Answer why the findings presented are important, i.e., what problem the research is trying to solve. - Answer if the finding is the first of its kind. - Answer what was innovative or distinct about the research. - Answer what the research enables other scientists in your field to do next. - - Include other scientific fields potentially impacted. - - Be understandable to a high school senior or college freshman. + - Include other scientific fields potentially impacted. + - Be understandable to a high school senior or college freshman. - Use short sentences and succinct words. - Avoid technical terms if possible. If technical terms are necessary, define them. - Use present tense. - In general, the description should speak about the research or researchers in first person. - - Use a minimum of 75 words and a maximum of 100 words. 
- + - Use a minimum of 75 words and a maximum of 100 words. + + Do not use markdown. Finally, do not exceed 100 words in the response. ```{0}``` """, - "summary": """ Generate a summary of the current research represented in the text delimited by triple backticks. - + The summary should meet the following criteria: - Should relay key findings and value. - - The summary should be still accessible to the non-specialist but may be more technical if necessary. - - Do not mention the names of institutions. - - If there is a United States Department of Energy Office of Science user facility involved, such as NERSC, you can mention the user facility. + - The summary should be still accessible to the non-specialist but may be more technical if necessary. + - Do not mention the names of institutions. + - If there is a United States Department of Energy Office of Science user facility involved, such as NERSC, you can mention the user facility. - Should be 1 or 2 paragraphs detailing the research. - Use present tense. - In general, the description should speak about the research or researchers in first person. - Use no more than 200 words. - Return only the summary. - + + Do not use markdown. Finally, do not exceed 200 words in the response. - + ```{0}``` """, - "figure": """ Generate a list of 5 search strings for use in the website that hosts free stock photos \ (e.g., https://www.pexels.com/) that would be representative of an aspect of the following research statement \ - delimited by triple backticks. + delimited by triple backticks. ```{0}``` """, - "caption": """ Generate a caption for a photograph describing an aspect of the research for the text delimited by triple backticks. - + The caption should meet the following criteria: - The caption should be greater than 200 characters and less than 255 character long. ```{0}``` """, - "objective": """ Generate one sentence stating the core purpose of the study. 
@@ -155,28 +153,26 @@ Here is the input text delimited by triple backticks: ```{2}``` """, - - "approach": """Clearly and concisely state in 2-3 short points how this work accomplished the stated objective from a methodolgocial perspecive. + "approach": """Clearly and concisely state in 2-3 short points how this work accomplished the stated objective from a methodolgocial perspecive. - Do not restate the objective or include results. - - Only include methodology including but not limited to: statistical, technological, and theory based approaches. - - Here is the objective statement: {1} + - Only include methodology including but not limited to: statistical, technological, and theory based approaches. + - Here is the objective statement: {1} - Use a different action verb to start sentences than what is used to begin the objective statement. - - Use active verbs for the start of each point. + - Use active verbs for the start of each point. - Use present tense. - Format the results as a hyphen-separated list. - The response must be in a hyphen-separated list. - + # Example response for the purposes of formatting RESPONSE: - Evaluate contemporary and hypothesized Western U.S. infrastructures with variable renewable generation shares for sensitivity to drought and Southern California heat wave scenarios on generation and load. - Use a stochastic temperature simulation combined with spatially resolved historical drought as a toolset to incorporate other grid stressors in high-resolution power system models, leading to improved sensitivity analyses not limited by the current ability of climate models to capture extreme conditions. - + # Input to process TEXT: {0} RESPONSE: """, - - "ppt_impact": """Clearly and concisely state in 3 bullet points the key results and outcomes from this research. + "ppt_impact": """Clearly and concisely state in 3 bullet points the key results and outcomes from this research. - State what the results indicate. 
- Include results that may be considered profound or surprising. - Each point should be 1 concise sentence. @@ -195,20 +191,17 @@ TEXT: {0} RESPONSE: """, - "reduce_wordcount": """Reduce the current text to be greater than {0} words and less than or equal to {1} words. \ The following is the text delimited by triple backquotes: - + ```{2}``` """, - "figure_caption": """Summarize the key findings of the paper as a figure caption. - Limit the response to 25 words. - + TEXT: {0} RESPONSE: """, - "figure_choice": """What figure from the results of the paper best represents the high impact content that can be easily understood by a non-technical, non-scientifc audience. Limit the response to: 1. The figure name as it is written in the text, @@ -218,18 +211,16 @@ TEXT: {0} RESPONSE: """, + "citation": """Generate the citation for this publication in Chicago style. + Do not use the example directly. - "citation": """Generate the citation for this publication in Chicago style. - Do not use the example directly. - # Example: Hadjimichael, A., J. Yoon, P. Reed, N. Voisin, W. Xu. 2023. “Exploring the Consistency of Water Scarcity Inferences between Large-Scale Hydrologic and Node-Based Water System Model Representations of the Upper Colorado River Basin,” J. Water Resour. Plann. Manage., 149(2): 04022081. DOI: 10.1061/JWRMD5.WRENG-5522 TEXT: {0} RESPONSE: """, - - "funding": """Extract the funding statement from the content provided. + "funding": """Extract the funding statement from the content provided. Only provide the funding statment. Do not provide header content like **Funding Statement:**. 
TEXT: {0} diff --git a/highlight/utils.py b/highlight/utils.py index 452df90..c5a049c 100644 --- a/highlight/utils.py +++ b/highlight/utils.py @@ -5,6 +5,14 @@ import highlight.prompts as prompts +from pathlib import Path +from typing import Union + +try: + import tomllib +except ImportError: + import tomli as tomllib + def get_token_count(text, model="gpt-4o"): """ @@ -25,6 +33,21 @@ def get_token_count(text, model="gpt-4o"): return n_text_tokens +def read_config(config_file: Union[str, Path]) -> dict: + """ + Read the configuration file and return its contents as a dictionary. + + Args: + config_file (str | Path): The path to the configuration file. + + Returns: + dict: The contents of the configuration file as a dictionary. + """ + with open(config_file, "rb") as cf: + config = tomllib.load(cf) + return config + + def read_pdf(file_object: object, reference_indicator: str = "References\n") -> dict: """ Extract text content from a PDF file until a specified reference indicator is encountered. @@ -68,7 +91,7 @@ def read_pdf(file_object: object, reference_indicator: str = "References\n") -> "n_pages": n_pages, "n_characters": len(content), "n_words": len(content.split(" ")), - "n_tokens": get_token_count(content) + "n_tokens": get_token_count(content), } @@ -87,23 +110,18 @@ def read_text(file_object: object) -> dict: - n_words (int): The number of words in the extracted content. - n_tokens (int): The number of tokens in the extracted content. """ - content = bytes.decode(file_object.read(), 'utf-8') + content = bytes.decode(file_object.read(), "utf-8") return { "content": content, "n_pages": 1, "n_characters": len(content), "n_words": len(content.replace("\n", " ").split()), - "n_tokens": get_token_count(content) + "n_tokens": get_token_count(content), } -def content_reduction( - client, - document_list, - system_scope, - model -): +def content_reduction(client, document_list, system_scope, model): """ Reduce the input text by removing irrelevant content. 
@@ -125,18 +143,15 @@ def content_reduction( page_tokens = get_token_count(page_content) messages = [ - {"role": "system", "content": system_scope}, - {"role": "user", "content": prompt.format(text=page_content)} + ("system", system_scope), + ("user", prompt.format(text=page_content)), ] - response = client.chat.completions.create( - model=model, - max_tokens=page_tokens, - temperature=0.0, - messages=messages - ) + response = client.with_config( + configurable={"model": model, "max_tokens": page_tokens, "temperature": 0.0} + ).invoke(messages) - content += response.choices[0].message.content + content += response.content return content @@ -148,7 +163,7 @@ def generate_prompt_content( max_tokens=50, temperature=0.0, max_allowable_tokens=8192, - model="gpt-4o" + model="gpt-4o", ): """ Generate content using the OpenAI API based on the provided prompt and parameters. @@ -172,24 +187,27 @@ def generate_prompt_content( n_prompt_tokens = get_token_count(prompt) + max_tokens if n_prompt_tokens > max_allowable_tokens: - raise RuntimeError(( - f"ERROR: input text tokens needs to be reduced due to exceeding the maximum ", - " allowable tokens per prompt by {n_prompt_tokens - max_allowable_tokens} tokens." 
- )) + raise RuntimeError( + ( + f"ERROR: input text tokens needs to be reduced due to exceeding the maximum ", + " allowable tokens per prompt by {n_prompt_tokens - max_allowable_tokens} tokens.", + ) + ) messages = [ - {"role": "system", "content": system_scope}, - {"role": "user", "content": prompt} + ("system", system_scope), + ("user", prompt), ] - response = client.chat.completions.create( - model=model, - max_tokens=max_tokens, - temperature=temperature, - messages=messages - ) + response = client.with_config( + configurable={ + "model": model, + "max_tokens": max_tokens, + "temperature": temperature, + } + ).invoke(messages) - content = response.choices[0].message.content + content = response.content return content @@ -207,7 +225,7 @@ def generate_content( max_word_count=100, min_word_count=75, max_allowable_tokens: int = 150000, - model="gpt-4o" + model="gpt-4o", ): """ Generate content using the OpenAI API based on the provided parameters and display it in a Streamlit container. 
@@ -238,7 +256,7 @@ def generate_content( max_tokens=max_tokens, max_allowable_tokens=max_allowable_tokens, additional_content=additional_content, - model=model + model=model, ) container.markdown(result_title) @@ -248,27 +266,30 @@ def generate_content( if word_count > max_word_count: # construct word count reduction prompt - reduction_prompt = prompts.prompt_queue["reduce_wordcount"].format(min_word_count, max_word_count, response) + reduction_prompt = prompts.prompt_queue["reduce_wordcount"].format( + min_word_count, max_word_count, response + ) messages = [ - {"role": "system", "content": prompts.prompt_queue["system"]}, - {"role": "user", "content": reduction_prompt} + ("system", prompts.prompt_queue["system"]), + ("user", reduction_prompt), ] - reduced_response = client.chat.completions.create( - model=model, - max_tokens=max_tokens, - temperature=temperature, - messages=messages - ) + reduced_response = client.with_config( + configurable={ + "model": model, + "max_tokens": max_tokens, + "temperature": temperature, + } + ).invoke(messages) - response = reduced_response.choices[0].message.content + response = reduced_response.content container.text_area( label=result_title, value=response, label_visibility="collapsed", - height=box_height + height=box_height, ) st.write(f"Word count: {len(response.split())}") @@ -284,7 +305,7 @@ def generate_prompt( max_allowable_tokens: int = 150000, temperature: float = 0.0, additional_content: str = None, - model: str = "gpt-4" + model: str = "gpt-4", ) -> str: """ Generate a prompt using the provided parameters and the prompt queue. 
@@ -305,39 +326,31 @@ def generate_prompt( if prompt_name in ("objective",): prompt = prompts.prompt_queue[prompt_name].format( - prompts.EXAMPLE_TEXT_ONE, - prompts.EXAMPLE_TEXT_TWO, - content + prompts.EXAMPLE_TEXT_ONE, prompts.EXAMPLE_TEXT_TWO, content ) elif prompt_name in ("approach",): if additional_content is None: additional_content = content - prompt = prompts.prompt_queue[prompt_name].format( - content, - additional_content - ) + prompt = prompts.prompt_queue[prompt_name].format(content, additional_content) elif prompt_name in ("subtitle",): if additional_content is None: additional_content = content - prompt = prompts.prompt_queue[prompt_name].format( - content, - additional_content - ) + prompt = prompts.prompt_queue[prompt_name].format(content, additional_content) elif prompt_name in ( - "figure", - "caption", - "impact", - "summary", - "title", - "science", + "figure", + "caption", + "impact", + "summary", + "title", + "science", "ppt_impact", - "figure_caption", - "figure_choice", + "figure_caption", + "figure_choice", "citation", - "funding" + "funding", ): prompt = prompts.prompt_queue[prompt_name].format(content) @@ -348,5 +361,5 @@ def generate_prompt( max_tokens=max_tokens, temperature=temperature, max_allowable_tokens=max_allowable_tokens, - model=model + model=model, ) diff --git a/pyproject.toml b/pyproject.toml index 2bf3b17..e807c9b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,11 +24,14 @@ dependencies = [ 'pypdf>=3.0.1', 'tiktoken>=0.7.0', 'tqdm>=4.66.1', + 'langchain_openai>=0.2.2', + "tomli; python_version < '3.11'" ] classifiers = [ "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", ] [project.optional-dependencies] @@ -40,6 +43,25 @@ test = [ deploy = [ "twine>=4.0.1", ] +ollama = [ + "langchain_ollama>=0.2.0" +] + +dartmouth = [ + "langchain_dartmouth>=0.2.11; python_version > '3.9'" +] + +anthropic = [ + 
"langchain_anthropic>=0.2.3" +] + +google = [ + "langchain-google-genai>=2.0.1" +] + +mistral = [ + "langchain_mistralai>=0.2.0" +] [project.urls] Repository = "https://github.com/crvernon/highlight" diff --git a/tests/test-config.toml b/tests/test-config.toml new file mode 100644 index 0000000..2c12e3d --- /dev/null +++ b/tests/test-config.toml @@ -0,0 +1,3 @@ +[llm] +[llm.openai] +models = ["gpt-4o", "gpt-4", "gpt-3.5-turbo-16k", "gpt-3.5-turbo"] \ No newline at end of file diff --git a/tests/test_utils.py b/tests/test_utils.py index 3af5c80..0ef9c7a 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,6 +1,7 @@ import unittest from unittest.mock import patch from io import BytesIO +from pathlib import Path import tiktoken from pypdf import PdfWriter, PdfReader @@ -26,6 +27,26 @@ def test_get_token_count_empty_text(self): self.assertEqual(hlt.get_token_count(text), expected_token_count) +class TestReadConfig(unittest.TestCase): + def test_read_config(self): + config = hlt.read_config(Path(__file__).parent.resolve() / "test-config.toml") + self.assertDictEqual( + { + "llm": { + "openai": { + "models": [ + "gpt-4o", + "gpt-4", + "gpt-3.5-turbo-16k", + "gpt-3.5-turbo", + ] + } + } + }, + config, + ) + + class TestReadPdf(unittest.TestCase): def test_read_pdf_without_reference_indicator(self): # Create a sample PDF file using pypdf @@ -44,7 +65,9 @@ def test_read_pdf_without_reference_indicator(self): buffer.seek(0) # Mock the PdfReader's extract_text method to return "Hello World" - with patch.object(PdfReader, 'pages', new_callable=unittest.mock.PropertyMock) as mock_pages: + with patch.object( + PdfReader, "pages", new_callable=unittest.mock.PropertyMock + ) as mock_pages: mock_page = unittest.mock.Mock() mock_page.extract_text.return_value = "Hello World" mock_pages.return_value = [mock_page] @@ -64,7 +87,7 @@ class TestReadText(unittest.TestCase): def test_read_text(self): # Simulate a text file using BytesIO sample_text = "Hello World!\nThis is a 
test file." - text_file = BytesIO(sample_text.encode('utf-8')) + text_file = BytesIO(sample_text.encode("utf-8")) result = hlt.read_text(text_file) @@ -72,4 +95,8 @@ def test_read_text(self): self.assertEqual(result["content"], sample_text) self.assertEqual(result["n_pages"], 1) self.assertEqual(result["n_characters"], len(sample_text)) - self.assertEqual(result["n_words"], 7) + self.assertEqual(result["n_words"], 7) + + +if __name__ == "__main__": + unittest.main()