0d078991bf6a056be5579cda14dd1f305078eb7d,tensorflow_transform/beam/cached_impl_test.py,CachedImplTest,test_caching_vocab_for_integer_categorical,#CachedImplTest#,566
Before Change
    }, {
        "x_vocab": -1,
    }]

    with beam_impl.Context(temp_dir=self.get_temp_dir()):
      with beam.Pipeline() as p:
        flat_data = p | "CreateInputData" >> beam.Create(
            list(itertools.chain(*input_data_dict.values())))
        cache_dict = {
            span_0_key: {
                b"__v0__VocabularyAccumulate[compute_and_apply_vocabulary/vocabulary]-\x05e\xfe4\x03H.P\xb5\xcb\xd22\xe3\x16\x15\xf8\xf5\xe38\xd9":
                    p | "CreateB" >> beam.Create(
                        [b"[-2, 2]", b"[-4, 1]", b"[-1, 1]", b"[4, 1]"]),
            },
            span_1_key: {},
        }
        transform_fn, cache_output = (
            (flat_data, input_data_dict, cache_dict, input_metadata)
            | "Analyze" >> beam_impl.AnalyzeDatasetWithCache(preprocessing_fn))
        dot_string = nodes.get_dot_graph(
            [analysis_graph_builder._ANALYSIS_GRAPH]).to_string()
        self.WriteRenderedDotFile(dot_string)
        self.assertNotIn(span_0_key, cache_output)
        _ = cache_output | "WriteCache" >> analyzer_cache.WriteAnalysisCacheToFS(
            self._cache_dir)
        transformed_dataset = (((input_data_dict[span_1_key], input_metadata),
                                transform_fn)
                               | "Transform" >> beam_impl.TransformDataset())
        transformed_data, _ = transformed_dataset
        beam_test_util.assert_that(
            transformed_data,
            beam_test_util.equal_to(expected_transformed_data),
            label="first")
  def test_non_frequency_vocabulary_merge(self):
    """This test compares vocabularies produced with and without cache."""
    mi_vocab_name = "mutual_information_vocab"
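For context on the caching round trip exercised above: the test writes the vocabulary accumulator cache with analyzer_cache.WriteAnalysisCacheToFS, and a follow-up run would read it back from disk rather than hand-building cache_dict. Below is a minimal sketch of that read-back, assuming the ReadAnalysisCacheFromFS transform in tensorflow_transform.beam.analyzer_cache at this commit; cache_dir, temp_dir, input_data_dict, input_metadata and preprocessing_fn are reused from the test, not defined here.

import itertools

import apache_beam as beam
import tensorflow_transform.beam as beam_impl
from tensorflow_transform.beam import analyzer_cache

# Hypothetical second run: cache_dir, temp_dir, input_data_dict,
# input_metadata and preprocessing_fn are assumed to exist as in the test.
with beam_impl.Context(temp_dir=temp_dir):
  with beam.Pipeline() as p:
    flat_data = p | "CreateInputData" >> beam.Create(
        list(itertools.chain(*input_data_dict.values())))
    # Read the per-span cache entries written by WriteAnalysisCacheToFS.
    cache_dict = p | "ReadCache" >> analyzer_cache.ReadAnalysisCacheFromFS(
        cache_dir, list(input_data_dict.keys()))
    transform_fn, cache_output = (
        (flat_data, input_data_dict, cache_dict, input_metadata)
        | "Analyze" >> beam_impl.AnalyzeDatasetWithCache(preprocessing_fn))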
After Change
    }, {
        "x_vocab": -1,
    }]

    with _TestPipeline() as p:
      flat_data = p | "CreateInputData" >> beam.Create(
          list(itertools.chain(*input_data_dict.values())))
      cache_dict = {
          span_0_key: {
              b"__v0__VocabularyAccumulate[compute_and_apply_vocabulary/vocabulary]-\x05e\xfe4\x03H.P\xb5\xcb\xd22\xe3\x16\x15\xf8\xf5\xe38\xd9":
                  p | "CreateB" >> beam.Create(
                      [b"[-2, 2]", b"[-4, 1]", b"[-1, 1]", b"[4, 1]"]),
          },
          span_1_key: {},
      }
      transform_fn, cache_output = (
          (flat_data, input_data_dict, cache_dict, input_metadata)
          | "Analyze" >> beam_impl.AnalyzeDatasetWithCache(preprocessing_fn))
      dot_string = nodes.get_dot_graph(
          [analysis_graph_builder._ANALYSIS_GRAPH]).to_string()
      self.WriteRenderedDotFile(dot_string)
      self.assertNotIn(span_0_key, cache_output)
      _ = cache_output | "WriteCache" >> analyzer_cache.WriteAnalysisCacheToFS(
          self._cache_dir)
      transformed_dataset = (
          ((input_data_dict[span_1_key], input_metadata), transform_fn)
          | "Transform" >> beam_impl.TransformDataset())
      transformed_data, _ = transformed_dataset
      beam_test_util.assert_that(
          transformed_data,
          beam_test_util.equal_to(expected_transformed_data),
          label="first")
    # 4 from analysis since 1 span was completely cached, and 4 from transform.
    self.assertEqual(_get_counter_value(p.metrics, "num_instances"), 8)
    self.assertEqual(_get_counter_value(p.metrics, "cache_entries_decoded"), 1)
    self.assertEqual(_get_counter_value(p.metrics, "cache_entries_encoded"), 1)
    self.assertEqual(_get_counter_value(p.metrics, "saved_models_created"), 2)
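The rewritten test relies on two helpers that do not appear in this excerpt: _TestPipeline, a beam.Pipeline subclass that keeps the pipeline result so metrics can be queried after the with block exits, and _get_counter_value, which looks up a named counter on that result. The sketch below shows plausible shapes for them; the bodies are assumptions rather than the commit's exact code, though MetricsFilter and MetricResults.query are the standard Beam metrics API.

import apache_beam as beam
from apache_beam.metrics.metric import MetricsFilter


class _TestPipeline(beam.Pipeline):
  """A Pipeline that retains its run result so metrics outlive the with block."""

  def run(self):
    # Stash the result so it can still be queried after the with block.
    result = super(_TestPipeline, self).run()
    result.wait_until_finish()
    self._run_result = result
    return result

  @property
  def metrics(self):
    # Valid only after the pipeline has run, i.e. once the with block exits.
    return self._run_result.metrics()


def _get_counter_value(metrics, name):
  # Sum the committed values of every counter whose name matches `name`.
  counters = metrics.query(MetricsFilter().with_name(name))["counters"]
  return sum(c.committed for c in counters)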
In pattern: SUPERPATTERN
Frequency: 3
Non-data size: 9
Instances
Project Name: tensorflow/transform
Commit Name: 0d078991bf6a056be5579cda14dd1f305078eb7d
Time: 2019-04-29
Author: zoy@google.com
File Name: tensorflow_transform/beam/cached_impl_test.py
Class Name: CachedImplTest
Method Name: test_caching_vocab_for_integer_categorical

Project Name: tensorflow/transform
Commit Name: 0d078991bf6a056be5579cda14dd1f305078eb7d
Time: 2019-04-29
Author: zoy@google.com
File Name: tensorflow_transform/beam/cached_impl_test.py
Class Name: CachedImplTest
Method Name: test_single_phase_mixed_analyzer_run_once

Project Name: tensorflow/transform
Commit Name: 0d078991bf6a056be5579cda14dd1f305078eb7d
Time: 2019-04-29
Author: zoy@google.com
File Name: tensorflow_transform/beam/cached_impl_test.py
Class Name: CachedImplTest
Method Name: test_single_phase_run_twice