92f19ac98a00ae5f25b4824ccbd505bc2f32e1c7,google/datalab/contrib/mltoolbox/commands/_ml.py,,_transform,#Any#Any#,410

Before Change




def _transform(args, cell):
  """Implements the transform step of the mltoolbox magic.

  Builds and (presumably) launches a `transform.py` command line from the
  parsed magic arguments and the cell config.  # NOTE(review): the visible
  # snippet stops after cloud-config validation; the command launch happens
  # later in the full function — confirm against the complete file.

  Args:
    args: dict-like parsed magic arguments; keys read here: "output",
        "analysis", "prefix", "cloud", "shuffle", "batch_size".
    cell: the raw cell text; parsed as YAML/config with keys
        "training_data" (required) and "cloud" (optional).

  Raises:
    ValueError: if training_data is neither a recognized dict shape nor a
        CsvDataSet / BigQueryDataSet instance.
  """
  env = google.datalab.utils.commands.notebook_environment()
  cell_data = google.datalab.utils.commands.parse_config(cell, env)
  google.datalab.utils.commands.validate_config(cell_data,
                                                required_keys=["training_data"],
                                                optional_keys=["cloud"])
  training_data = cell_data["training_data"]
  cmd_args = ["python", "transform.py",
              "--output", _abs_path(args["output"]),
              "--analysis", _abs_path(args["analysis"]),
              "--prefix", args["prefix"]]
  if args["cloud"]:
    cmd_args.append("--cloud")
    cmd_args.append("--async")
  if args["shuffle"]:
    cmd_args.append("--shuffle")
  if args["batch_size"]:
    cmd_args.extend(["--batch-size", str(args["batch_size"])])

  if isinstance(training_data, dict):
    if "csv" in training_data:
      cmd_args.extend(["--csv", _abs_path(training_data["csv"])])
    elif "bigquery_table" in training_data:
      cmd_args.extend(["--bigquery", training_data["bigquery_table"]])
    elif "bigquery_sql" in training_data:
      # Materialize the SQL into a temporary table so transform.py can read it.
      # See https://cloud.google.com/bigquery/querying-data#temporary_and_permanent_tables
      print("Creating temporary table that will be deleted in 24 hours")
      r = bq.Query(training_data["bigquery_sql"]).execute().result()
      cmd_args.extend(["--bigquery", r.full_name])
    else:
      # Fixed: the original message used unescaped inner double quotes,
      # which is a syntax error.
      raise ValueError("Invalid training_data dict. " +
                       'Requires either "csv" and "schema", or "bigquery".')
  elif isinstance(training_data, google.datalab.ml.CsvDataSet):
    for file_name in training_data.input_files:
      cmd_args.append("--csv=" + _abs_path(file_name))
  elif isinstance(training_data, google.datalab.ml.BigQueryDataSet):
    cmd_args.extend(["--bigquery", training_data.table])
  else:
    raise ValueError("Invalid training data. Requires either a dict, " +
                     "a google.datalab.ml.CsvDataSet, or a google.datalab.ml.BigQueryDataSet.")

  if "cloud" in cell_data:
    cloud_config = cell_data["cloud"]
    google.datalab.utils.commands.validate_config(
        cloud_config,
        required_keys=[],
        optional_keys=["num_workers", "worker_machine_type", "project_id"])

After Change


    raise ValueError("Invalid training data. Requires either a dict, " +
                     "a google.datalab.ml.CsvDataSet, or a google.datalab.ml.BigQueryDataSet.")

  cloud_config = args["cloud_config"]
  if cloud_config:
    google.datalab.utils.commands.validate_config(
        cloud_config,
        required_keys=[],
Italian Trulli
In pattern: SUPERPATTERN

Frequency: 3

Non-data size: 17

Instances


Project Name: googledatalab/pydatalab
Commit Name: 92f19ac98a00ae5f25b4824ccbd505bc2f32e1c7
Time: 2017-07-03
Author: qimingj@users.noreply.github.com
File Name: google/datalab/contrib/mltoolbox/commands/_ml.py
Class Name:
Method Name: _transform


Project Name: googledatalab/pydatalab
Commit Name: 92f19ac98a00ae5f25b4824ccbd505bc2f32e1c7
Time: 2017-07-03
Author: qimingj@users.noreply.github.com
File Name: google/datalab/contrib/mltoolbox/commands/_ml.py
Class Name:
Method Name: _train


Project Name: googledatalab/pydatalab
Commit Name: 92f19ac98a00ae5f25b4824ccbd505bc2f32e1c7
Time: 2017-07-03
Author: qimingj@users.noreply.github.com
File Name: google/datalab/contrib/mltoolbox/commands/_ml.py
Class Name:
Method Name: _transform


Project Name: googledatalab/pydatalab
Commit Name: 92f19ac98a00ae5f25b4824ccbd505bc2f32e1c7
Time: 2017-07-03
Author: qimingj@users.noreply.github.com
File Name: google/datalab/contrib/mltoolbox/commands/_ml.py
Class Name:
Method Name: _batch_predict