Skip to content

gtfs_segments.utils

download_write_file(url, folder_path)

It takes a URL and a folder path as input, creates a new folder if it does not exist, downloads the file from the URL, and writes the file to the folder path

Parameters:

Name Type Description Default
url str

The URL of the GTFS file you want to download

required
folder_path str

The path to the folder where you want to save the GTFS file.

required

Returns:

Type Description
str

The location of the file that was downloaded.

Source code in gtfs_segments/utils.py
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
def download_write_file(url: str, folder_path: str) -> str:
    """
    Download the GTFS file at `url` and write it into `folder_path` as ``gtfs.zip``,
    creating the folder first if it does not exist.

    Args:
      url: The URL of the GTFS file you want to download
      folder_path: The path to the folder where you want to save the GTFS file.

    Returns:
      The location of the file that was downloaded.

    Raises:
      ValueError: If the download fails (network error or non-2xx HTTP status).
    """
    # Create a new directory if it does not exist
    os.makedirs(folder_path, exist_ok=True)
    gtfs_file_loc = os.path.join(folder_path, "gtfs.zip")

    try:
        r = requests.get(url, allow_redirects=True, timeout=300)
        # Fail early on HTTP errors instead of silently writing an
        # error page to disk as gtfs.zip.
        r.raise_for_status()
        # `with` guarantees the handle is closed even if the write fails.
        with open(gtfs_file_loc, "wb") as file:
            file.write(r.content)
    except requests.exceptions.RequestException as e:
        print(e)
        raise ValueError(f"Failed to download {url}") from e
    return gtfs_file_loc

export_segments(df, file_path, output_format, geometry=True)

This function takes a GeoDataFrame of segments, a file path, an output format, and a boolean value for whether or not to include the geometry in the output.

If the output format is GeoJSON, the function will output the GeoDataFrame to a GeoJSON file.

If the output format is CSV, the function will output the GeoDataFrame to a CSV file. If the geometry boolean is set to True, the function will output the CSV file with the geometry column. If the geometry boolean is set to False, the function will output the CSV file without the geometry column.

The function will also add additional columns to the CSV file, including the start and end points of the segments, the start and end longitude and latitude of the segments, and the distance of the segments.

The function will also add a column to the CSV file that indicates the number of times the segment was traversed.

Parameters:

Name Type Description Default
df DataFrame

the dataframe containing the segments

required
file_path str

The path to the file you want to export to.

required
output_format str

geojson or csv

required
geometry bool

If True, the output will include the geometry of the segments. If False, the output will only include the start and end points of the segments. Defaults to True

True

Source code in gtfs_segments/utils.py
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
def export_segments(
    df: pd.DataFrame, file_path: str, output_format: str, geometry: bool = True
) -> None:
    """
    Export a GeoDataFrame of segments to GeoJSON or CSV.

    For GeoJSON output, the GeoDataFrame is written as-is. For CSV output the
    function adds the start and end points of each segment (as WKT); when
    `geometry` is False it instead writes the start/end longitude and latitude
    as separate columns and drops the geometry columns.

    Args:
      df: the dataframe containing the segments
      file_path: The path to the file you want to export to.
      output_format: geojson or csv
      geometry: If True, the output will include the geometry of the segments.
        If False, the output will only include the start and end points of the
        segments. Defaults to True

    Raises:
      ValueError: If `output_format` is neither "geojson" nor "csv".
    """
    # Output to GeoJSON
    if output_format == "geojson":
        df.to_file(file_path, driver="GeoJSON")
    elif output_format == "csv":
        s_df = df.copy()
        geom_list = s_df.geometry.apply(lambda g: np.array(g.coords))
        # WKT of the first/last coordinate of each segment
        s_df["start_point"] = [Point(g[0]).wkt for g in geom_list]
        s_df["end_point"] = [Point(g[-1]).wkt for g in geom_list]
        if geometry:
            # Output with the LineString geometry column
            s_df.to_csv(file_path, index=False)
        else:
            # Only compute the lon/lat split when it is actually written out
            # (previously these columns were built and then discarded in the
            # geometry=True branch).
            s_df["start_lon"] = [g[0][0] for g in geom_list]
            s_df["start_lat"] = [g[0][1] for g in geom_list]
            s_df["end_lon"] = [g[-1][0] for g in geom_list]
            s_df["end_lat"] = [g[-1][1] for g in geom_list]
            d_df = s_df.drop(columns=["geometry", "start_point", "end_point"])
            # Output without LS
            d_df.to_csv(file_path, index=False)
    else:
        # Previously an unrecognized format silently produced no file at all.
        raise ValueError(f"Unsupported output_format: {output_format}. Use 'geojson' or 'csv'.")

failed_pipeline(message, filename, folder_path)

If the folder path exists, delete it and return the failure message.

Parameters:

Name Type Description Default
message str

The message to be returned

required
filename str

The name of the file that is being processed

required
folder_path str

The path to the folder where the file is located

required

Returns:

Type Description
str

a string that is the concatenation of the message and the filename, indicating failure

Source code in gtfs_segments/utils.py
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
def failed_pipeline(message: str, filename: str, folder_path: str) -> str:
    """
    Clean up after a failed pipeline run: remove `folder_path` if it exists,
    then build the failure message.

    Args:
      message: The message to be returned
      filename: The name of the file that is being processed
      folder_path: The path to the folder where the file is located

    Returns:
      a string that is the concatenation of the message and the filename,
      indicating failure
    """
    if os.path.exists(folder_path):
        # Discard any partially-processed data for this feed
        shutil.rmtree(folder_path)
    return f"{message} : {filename}"

plot_hist(df, save_fig=False, show_mean=False, **kwargs)

It takes a dataframe with two columns, one with the distance between stops and the other with the number of traversals between those stops, and plots a weighted histogram of the distances

Parameters:

Name Type Description Default
df DataFrame

The dataframe that contains the data

required
save_fig bool

If True, the figure will be saved to the file_path. Defaults to False

False
show_mean bool

If True, will show the mean of the distribution. Defaults to False

False

Returns:

Type Description
Figure

A matplotlib axis

Source code in gtfs_segments/utils.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
def plot_hist(
    df: pd.DataFrame, save_fig: bool = False, show_mean: bool = False, **kwargs: Any
) -> plt.Figure:
    """
    Plot a traversal-weighted histogram of the stop spacings in `df`.

    Args:
      df: The dataframe that contains the data (expects `distance` and
        `traversals` columns)
      save_fig: If True, the figure will be saved to the file_path. Defaults to False
      show_mean: If True, will show the mean of the distribution. Defaults to False
      **kwargs: Optional `max_spacing` (filter threshold, default 3000),
        `ax` (existing matplotlib axis to draw on), `title`, and `file_path`
        (required when save_fig is True).

    Returns:
      The matplotlib figure containing the plot.
    """
    if "max_spacing" not in kwargs:
        max_spacing = 3000
        print("Using max_spacing = 3000")
    else:
        max_spacing = kwargs["max_spacing"]
    if "ax" in kwargs:
        ax = kwargs["ax"]
        # Bug fix: `fig` was previously undefined in this branch, so
        # `return fig` raised a NameError whenever a caller supplied its
        # own axis.
        fig = ax.get_figure()
    else:
        fig, ax = plt.subplots(figsize=(8, 6))
    df = df[df["distance"] < max_spacing]
    # Repeat each spacing by its traversal count -> traversal-weighted sample
    data = np.hstack([np.repeat(x, y) for x, y in zip(df["distance"], df.traversals)])
    # Draw on `ax` explicitly so a caller-supplied axis is actually used
    # (the previous plt.* calls targeted whatever axes happened to be current).
    ax.hist(
        data,
        range=(0, max_spacing),
        density=True,
        bins=int(max_spacing / 50),
        fc=(0, 105 / 255, 160 / 255, 0.4),
        ec="white",
        lw=0.8,
    )
    x = np.arange(0, max_spacing, 5)
    # Kernel-density overlay of the same weighted sample
    ax.plot(x, gaussian_kde(data)(x), lw=1.5, color=(0, 85 / 255, 120 / 255, 1))
    ax.set_xlim([0, max_spacing])
    ax.set_xlabel("Stop Spacing [m]")
    ax.set_ylabel("Density - Traversal Weighted")
    ax.set_title("Histogram of Spacing")
    if show_mean:
        mean_spacing = np.mean(data)
        ax.axvline(mean_spacing, color="k", linestyle="dashed", linewidth=2)
        _, max_ylim = ax.get_ylim()
        ax.text(
            mean_spacing * 1.1,
            max_ylim * 0.9,
            "Mean: {:.0f}".format(mean_spacing),
            fontsize=12,
        )
    if "title" in kwargs:
        ax.set_title(kwargs["title"])
    if save_fig:
        assert "file_path" in kwargs.keys(), "Please pass in the `file_path`"
        fig.savefig(kwargs["file_path"], dpi=300)
    # Close this figure specifically (plain plt.close() only closes the
    # current figure, which may differ when `ax` was passed in).
    plt.close(fig)
    return fig

process(pipeline_gtfs, row, max_spacing)

It takes a pipeline, a row from the sources_df, and a max_spacing, and returns the output of the pipeline

Parameters:

Name Type Description Default
pipeline_gtfs Any

This is the function that will be used to process the GTFS data.

required
row Series

This is a row in the sources_df dataframe. It contains the name of the provider, the url to the gtfs file, and the bounding box of the area that the gtfs file covers.

required
max_spacing float

Maximum Allowed Spacing between two consecutive stops.

required

Returns:

Type Description
Any

The return value is a tuple of the form (filename,folder_path,df)

Source code in gtfs_segments/utils.py
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
def process(pipeline_gtfs: Any, row: pd.core.series.Series, max_spacing: float) -> Any:
    """
    Run `pipeline_gtfs` for one row of the sources_df and return its output.

    Args:
        pipeline_gtfs: This is the function that will be used to process the GTFS data.
        row: This is a row in the sources_df dataframe. It contains the name of the provider, the url to
            the gtfs file, and the bounding box of the area that the gtfs file covers.
        max_spacing: Maximum Allowed Spacing between two consecutive stops.

    Returns:
        The return value is a tuple of the form (filename,folder_path,df)

    Raises:
        ValueError: If the pipeline fails for this provider; the original
            exception is chained as the cause.
    """
    filename = row["provider"]
    url = row["urls.latest"]
    # Bounding box as [[min_lon, min_lat], [max_lon, max_lat]]
    bounds = [
        [row["minimum_longitude"], row["minimum_latitude"]],
        [row["maximum_longitude"], row["maximum_latitude"]],
    ]
    print(filename)
    try:
        return pipeline_gtfs(filename, url, bounds, max_spacing)
    except Exception as e:
        traceback.print_exc()
        # Bug fix: the f-string previously contained no placeholder and always
        # reported "Failed for (unknown)" — include the provider name instead.
        raise ValueError(f"Failed for {filename}") from e

summary_stats(df, max_spacing=3000, min_spacing=10, export=False, **kwargs)

It takes in a dataframe, and returns a dataframe with summary statistics. The max_spacing and min_spacing serve as threshold to remove outliers.

Parameters:

Name Type Description Default
df DataFrame

The dataframe that you want to get the summary statistics for.

required
max_spacing float

The maximum spacing between two stops. Defaults to 3000[m]

3000
min_spacing float

The minimum spacing between two stops. Defaults to 10[m]

10
export bool

If True, the summary will be exported to a csv file. Defaults to False

False

Returns:

Type Description
DataFrame

A dataframe with the summary statistics

Source code in gtfs_segments/utils.py
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
def summary_stats(
    df: pd.DataFrame, max_spacing: float = 3000, min_spacing: float = 10, export: bool = False, **kwargs: Any
) -> pd.DataFrame:
    """
    Build a one-column dataframe of summary statistics for the stop spacings.
    Spacings outside [min_spacing, max_spacing] are treated as outliers and
    removed before the statistics are computed.

    Args:
      df: The dataframe that you want to get the summary statistics for.
      max_spacing: The maximum spacing between two stops. Defaults to 3000[m]
      min_spacing: The minimum spacing between two stops. Defaults to 10[m]
      export: If True, the summary will be exported to a csv file. Defaults to False

    Returns:
      A dataframe with the summary statistics
    """
    print("Using max_spacing = ", max_spacing)
    print("Using min_spacing = ", min_spacing)
    # Traversal-weighted share of spacings above the threshold, computed
    # before the outliers are filtered out.
    over_limit = df.loc[df["distance"] > max_spacing, "traversals"].sum()
    percent_spacing = round(over_limit / df["traversals"].sum() * 100, 3)
    within_bounds = (df["distance"] <= max_spacing) & (df["distance"] >= min_spacing)
    df = df[within_bounds]

    def _unique_spacings(keys: list) -> pd.Series:
        # One distance per unique (keys..., distance) combination.
        return df.groupby(keys + ["distance"]).first().reset_index()["distance"]

    seg_spacings = _unique_spacings(["segment_id"])
    route_spacings = _unique_spacings(["route_id", "segment_id"])
    # Repeat each spacing by its traversal count for the weighted statistics.
    weighted_data = np.hstack([np.repeat(d, t) for d, t in zip(df["distance"], df.traversals)])

    def _q(q: float) -> float:
        return np.round(np.quantile(weighted_data, q), 3)

    stats = {
        "Segment Weighted Mean": np.round(seg_spacings.mean(), 2),
        "Route Weighted Mean": np.round(route_spacings.mean(), 2),
        "Traversal Weighted Mean": np.round(np.mean(weighted_data), 3),
        "Segment Weighted Median": np.round(seg_spacings.median(), 2),
        "Route Weighted Median": np.round(route_spacings.median(), 2),
        "Traversal Weighted Median": np.round(np.median(weighted_data), 2),
        "Traversal Weighted Std": np.round(np.std(weighted_data), 3),
        "Traversal Weighted 25 % Quantile": _q(0.25),
        "Traversal Weighted 50 % Quantile": _q(0.50),
        "Traversal Weighted 75 % Quantile": _q(0.75),
        "No of Segments": int(df.segment_id.nunique()),
        "No of Routes": int(df.route_id.nunique()),
        "No of Traversals": int(df.traversals.sum()),
        "Max Spacing": int(max_spacing),
        "% Segments w/ spacing > max_spacing": percent_spacing,
    }
    summary_df = pd.DataFrame([stats])
    if export:
        assert "file_path" in kwargs.keys(), "Please pass in the `file_path`"
        summary_df.to_csv(kwargs["file_path"], index=False)
        print("Saved the summary in " + kwargs["file_path"])
    return summary_df.T