BART_LARGE_CNN
Download Flojoy Studio to try this app
  
Take an input dataframe with multiple rows and a single column, and produce a dataframe with a single "summary_text" column. Each row of the "summary_text" column contains a summary of the text in the corresponding row of the input dataframe. Params: default : DataFrame - a single-column dataframe holding the text to summarize. Returns: out : DataFrame - a dataframe containing the summary text in the "summary_text" column.
Python Code
from flojoy import flojoy, DataFrame
@flojoy(deps={"transformers": "4.30.2", "torch": "2.0.1", "torchvision": "0.15.2"})
def BART_LARGE_CNN(default: DataFrame) -> DataFrame:
    """Summarize every row of a single-column dataframe with facebook/bart-large-cnn.

    Take an input dataframe with multiple rows and a single column, then
    produce a dataframe with a single "summary_text" column. The
    "summary_text" column contains a summary of the text in the corresponding
    row of the input dataframe.

    Each text is tokenized without truncation and split into consecutive
    chunks of at most 1024 token ids. Every chunk is summarized independently
    and the per-chunk summaries are joined with newlines.

    Parameters
    ----------
    default : DataFrame
        The text to summarize. Must wrap a dataframe with exactly one column.

    Returns
    -------
    DataFrame
        dataframe containing the summary text in the "summary_text" column

    Raises
    ------
    AssertionError
        If the input dataframe does not have exactly one column.
    """
    import torch
    from flojoy import snapshot_download
    from transformers import BartTokenizer, BartForConditionalGeneration
    import pandas as pd

    input_df = default.m
    # Raise explicitly instead of using `assert`: assert statements are removed
    # when Python runs with -O, which would silently skip this validation.
    # AssertionError is kept so existing callers see the same exception type.
    if len(input_df.columns) != 1:
        raise AssertionError("Can only take a single-column dataframe as input")

    # Number of token ids fed to the model per chunk.
    # NOTE(review): presumably chosen to match the model's maximum input
    # length; `tokenizer.model_max_length - 1` was considered as an
    # alternative -- confirm before changing.
    chunk_size = 1024
    # Upper bound passed to `generate` for each chunk's summary length.
    max_summary_length = chunk_size // 2

    # Load the repo from either the local cache or from the web, and get the local path
    local_path = snapshot_download(
        repo_id="facebook/bart-large-cnn", revision="3d22493"
    )
    # Load the pre-trained BART model
    model = BartForConditionalGeneration.from_pretrained(local_path)
    tokenizer = BartTokenizer.from_pretrained(local_path)

    def _chunk_text(text):
        """Tokenize `text` untruncated and split the ids into (1, <=chunk_size) tensors."""
        token_ids = tokenizer(
            text, max_length=None, return_tensors="pt", truncation=False
        )["input_ids"][0]
        return [
            # Re-add the batch dimension that indexing with [0] removed.
            torch.unsqueeze(token_ids[start : start + chunk_size], 0)
            for start in range(0, len(token_ids), chunk_size)
        ]

    def _summarize_text(text):
        """Summarize `text` chunk by chunk and join the pieces with newlines."""
        chunk_summaries = []
        for chunk in _chunk_text(text):
            generated = model.generate(
                chunk,
                num_beams=4,
                max_length=max_summary_length,
                early_stopping=True,
            )
            chunk_summaries.append(
                "\n".join(
                    tokenizer.decode(
                        sequence,
                        skip_special_tokens=True,
                        clean_up_tokenization_spaces=False,
                    )
                    for sequence in generated
                )
            )
        return "\n".join(chunk_summaries)

    column = input_df.columns[0]
    # Gradients are not needed for generation.
    with torch.inference_mode():
        output_df = pd.DataFrame(
            input_df[column].apply(_summarize_text).rename("summary_text")
        )
    return DataFrame(df=output_df)
Example App
Having problems with this example app? Join our Discord community and we will help you out!
In this example, a READ_CSV node reads a single-column dataframe whose single row contains a long text.
The BART_LARGE_CNN node then produces an output dataframe containing the corresponding summary for each input row.