Creating a df based on total permutations deriving from user-input variables

Question:

I would like to pass n cities to visit, along with the number of days to spend in each, to a function that returns a DataFrame containing every possible permutation of the journey. The kayak_search_url column in the first row of the DataFrame should contain this string:

https://www.kayak.com/flights/AMS-WAW,nearby/2023-02-14/WAW-BOG,nearby/2023-02-17/BOG-MIL,nearby/2023-02-20/MIL-SDQ,nearby/2023-02-23/SDQ-AMS,nearby/2023-02-25/?sort=bestflight_a

…but instead contains this string:

https://www.kayak.com/flights/AMS-WAW,nearby/2023-02-14/AMS-BOG,nearby/2023-02-17/AMS-MIL,nearby/2023-02-20/AMS-SDQ,nearby/2023-02-23/AMS,nearby/2023-02-25/?sort=bestflight_a

I can’t figure out why the origin code ‘AMS’ shows up as the departure of every leg instead of the previous city in the chain (and why the final leg has no departure code at all). Here’s the code:

# Trip configuration: the journey starts in `start_city` on `start_date` and finishes in `end_city`.
start_city = 'Amsterdam'
end_city = 'Amsterdam'
start_date = '2023-02-14'

# Cities to visit and the number of days to stay in each. The two lists are paired by position
# (generate_permutations zips them together), so days[i] is the stay length for cities[i].
cities = ['Warsaw', 'Bogota', 'Milan', 'Santo Domingo']
days = [3,3,3,2]

def generate_permutations(cities, days, start_city, end_city, start_date):
    """Build a DataFrame with one row per ordering of ``cities``.

    The resulting columns are ``origin``, ``city1`` .. ``cityN``, ``end``,
    ``flight_dt_1`` .. ``flight_dt_{N+1}`` and ``kayak_search_url``.

    Args:
        cities: Names of the cities to visit; every name must be a key of the
            ``iata`` table defined below, otherwise the lookup raises KeyError.
        days: Stay length in days for each city, paired with ``cities`` by position.
        start_city: City the journey departs from (also looked up in ``iata``).
        end_city: City the journey returns to (also looked up in ``iata``).
        start_date: Date of the first flight, in any form ``pd.to_datetime`` accepts.

    Returns:
        The DataFrame described above.

    NOTE(review): the URL built at the end is wrong, as described in the
    question. Every leg uses ``x['origin']`` as its departure code instead of
    the previous stop, and the closing segment appends only the ``end`` code
    with no departure code or hyphen in front of it.
    """
    # Stay length keyed by city name, so the dates follow the city in every ordering.
    city_to_days = dict(zip(cities, days))
    
    # One row per permutation, columns city1..cityN, then add the fixed origin/end columns.
    permutations = list(itertools.permutations(cities))
    df = pd.DataFrame(permutations, columns=['city' + str(i) for i in range(1, len(cities) + 1)])
    df['origin'] = start_city
    df['end'] = end_city
    # Move 'origin' to the front so the columns read origin, city1..cityN, end.
    first_column = df.pop('origin')
    df.insert(0, 'origin', first_column)
    
    # The first flight departs on the start date for every row.
    st_dt = pd.to_datetime(start_date)
    df = df.assign(flight_dt_1=st_dt)
    
    # Each subsequent flight date is the previous flight date plus the stay in the city just visited.
    for i in range(len(cities)):
        df['flight_dt_' + str(i + 2)] = df['flight_dt_' + str(i + 1)] + df['city' + str(i + 1)].map(city_to_days).map(lambda x: pd.Timedelta(days=x))
    
    # IATA city code dictionary from iata_code.csv file in repo and create Kayak 'url' column for each permutation
    iata = {'Amsterdam': 'AMS',
            'Warsaw': 'WAW',
            'Bogota': 'BOG',
            'Milan': 'MIL',
            'Santo Domingo': 'SDQ'}

    url = 'https://www.kayak.com/flights/'
    # NOTE(review): this is where the reported bug lives. The departure code of leg i is always
    # iata[x['origin']] rather than the code of the previous stop, and the final segment is just
    # iata[x['end']] with no departure code, which yields '.../AMS-BOG,.../AMS,nearby/...'.
    df['kayak_search_url'] = df.apply(lambda x: url + ''.join([iata[x['origin']] + '-' + iata[x['city' + str(i+1)]] + 
                                                               ',nearby/' + str(x['flight_dt_' + str(i+1)].strftime("%Y-%m-%d")) + '/' 
                                                               for i in range(len(cities))]) + iata[x['end']] + ',nearby/' + str(x['flight_dt_' + str(len(cities) + 1)].strftime("%Y-%m-%d")) + 
                                                               '/?sort=bestflight_a', axis=1)
    
    return df
Asked By: June Smith

||

Answers:

Let’s break down the desired URL to highlight its structure:

https://www.kayak.com/flights
    /AMS-WAW,nearby/2023-02-14
    /WAW-BOG,nearby/2023-02-17
    /BOG-MIL,nearby/2023-02-20
    /MIL-SDQ,nearby/2023-02-23
    /SDQ-AMS,nearby/2023-02-25
    /?sort=bestflight_a

Only the middle section needs to be generated, since the other parts are static. We can also build that middle section before constructing the dataframe:

def generate_permutations(cities, days, start_city, end_city, start_date):
    """Return every ordering of ``cities`` as a DataFrame with a Kayak search URL.

    Each row describes one itinerary: ``start_city`` -> a permutation of
    ``cities`` -> ``end_city``. The stay length travels with its city, so the
    flight dates of a row depend on the order in which the cities are visited.

    Args:
        cities: Names of the cities to visit. Every name (and both endpoint
            cities) must be a key of the ``iata`` table below.
        days: Number of days to stay in each city; ``days[i]`` belongs to
            ``cities[i]``.
        start_city: City the journey departs from.
        end_city: City the journey finishes in.
        start_date: Date of the first flight, in any form accepted by
            ``pd.to_datetime``.

    Returns:
        A DataFrame with columns ``origin``, ``city1`` .. ``cityN``, ``end``,
        ``flight_dt_1`` .. ``flight_dt_{N+1}`` and ``kayak_search_url``, one
        row per permutation (in ``itertools.permutations`` order).

    Raises:
        ValueError: If ``cities`` and ``days`` differ in length.
        KeyError: If a city has no entry in the ``iata`` table.
    """
    if len(cities) != len(days):
        raise ValueError(
            f"cities and days must have the same length, "
            f"got {len(cities)} and {len(days)}"
        )

    iata = {
        "Amsterdam": "AMS",
        "Warsaw": "WAW",
        "Bogota": "BOG",
        "Milan": "MIL",
        "Santo Domingo": "SDQ",
    }

    first_flight = pd.to_datetime(start_date)

    rows = []
    # Permute (city, stay) pairs rather than bare city names so that each
    # city's stay length moves with it when the visiting order changes.
    for stops in itertools.permutations(zip(cities, days)):
        route = (start_city, *(city for city, _ in stops), end_city)
        stays = [stay for _, stay in stops]

        # Flight k departs after the cumulative stay in all earlier stops;
        # the leading 0 is the first flight on the start date itself.
        flight_dates = first_flight + pd.to_timedelta(
            np.cumsum([0, *stays]),
            unit="D",
        )

        # The pattern for each segment is
        #     START-END,nearby/yyyy-mm-dd
        mid_url = "/".join(
            f"{iata[src]}-{iata[dst]},nearby/{when:%Y-%m-%d}"
            for src, dst, when in zip(route[:-1], route[1:], flight_dates)
        )
        url = f"https://www.kayak.com/flights/{mid_url}/?sort=bestflight_a"

        rows.append((*route, *flight_dates, url))

    columns = [
        "origin",
        *[f"city{i + 1}" for i in range(len(cities))],
        "end",
        *[f"flight_dt_{i + 1}" for i in range(len(cities) + 1)],
        "kayak_search_url",
    ]
    return pd.DataFrame(rows, columns=columns)
Answered By: Code Different
Categories: questions Tags: , , , ,
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with
at the top-right corner.