Python airflow.DAG Examples

The following are 30 code examples of airflow.DAG(), drawn from open-source projects. The originating project, source file, and license are noted above each example. You may also want to check out the other available functions and classes of the airflow module.
Example #1
Source File: rawpixel_workflow.py    From cccatalog with MIT License
def create_dag():
    dag = DAG(
        dag_id=DAG_ID,
        default_args=DAG_DEFAULT_ARGS,
        start_date=datetime(2020, 1, 15),
        schedule_interval="@monthly",
        catchup=False
    )

    with dag:
        start_task = get_log_operator(dag, DAG_ID, "Starting")
        run_task = get_runner_operator(dag)
        end_task = get_log_operator(dag, DAG_ID, "Finished")

        start_task >> run_task >> end_task

    return dag 
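
Note that a factory like create_dag() only builds the DAG object; the Airflow scheduler discovers DAGs by scanning module-level globals, so the result still has to be bound to a module-level name. A minimal sketch of the usual registration, assuming DAG_ID and create_dag are defined as above:

# Hypothetical module-level hookup: bind the returned DAG to a global
# so the scheduler's DagBag can find it when it parses this file.
globals()[DAG_ID] = create_dag()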
Example #2
Source File: dbt_example.py    From Example-Airflow-DAGs with Apache License 2.0
def dbt_dag(start_date, schedule_interval, default_args):
    temp_dag = DAG('gospel_.dbt_sub_dag', start_date=start_date, schedule_interval=schedule_interval, default_args=default_args)
    G = nx.read_gpickle('/home/airflowuser/project/graph.gpickle')

    def make_dbt_task(model_name):
        simple_model_name = model_name.split('.')[-1]
        dbt_task = BashOperator(
                    task_id=model_name,
                    bash_command='cd ~/gospel && dbt run --profile=warehouse --target=prod --non-destructive --models {simple_model_name}'.format(simple_model_name=simple_model_name),
                    dag=temp_dag
                    )
        return dbt_task


    dbt_tasks = {}
    for node_name in set(G.nodes()):
        dbt_task = make_dbt_task(node_name)
        dbt_tasks[node_name] = dbt_task

    for edge in G.edges():
        dbt_tasks[edge[0]].set_downstream(dbt_tasks[edge[1]])
    return temp_dag 
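
The gpickle file read above encodes dbt's model dependency graph and is produced ahead of time. A minimal sketch of building and saving such a graph with networkx (model names and path are illustrative; write_gpickle is the pre-networkx-3.0 counterpart of the read_gpickle call used above):

import networkx as nx

# Hypothetical dependency graph: nodes are dbt model names, and an edge
# A -> B means model B must run after model A.
G = nx.DiGraph()
G.add_edge('model.gospel.stg_orders', 'model.gospel.fct_orders')
nx.write_gpickle(G, '/home/airflowuser/project/graph.gpickle')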
Example #3
Source File: phylopic_workflow.py    From cccatalog with MIT License
def create_dag():
    dag = DAG(
        dag_id=DAG_ID,
        default_args=DAG_DEFAULT_ARGS,
        concurrency=1,
        max_active_runs=1,
        start_date=datetime(2011, 1, 1),
        schedule_interval='@daily',
        catchup=False,
    )

    with dag:
        start_task = get_log_operator(dag, DAG_ID, 'Starting')
        run_task = get_runner_operator(dag)
        end_task = get_log_operator(dag, DAG_ID, 'Finished')

        start_task >> run_task >> end_task

    return dag 
Example #4
Source File: factory.py    From starthinker with Apache License 2.0
def print_commandline(self):

    print('')
    print('DAG: %s' % self.dag_name)
    print('')
 
    instances = {}
    for task in self.recipe['tasks']:
      function = next(iter(task.keys()))

      # count instance per task
      instances.setdefault(function, 0)
      instances[function] += 1

      print('airflow test "%s" %s_%d %s' % (self.dag_name, function, instances[function], str(date.today())))
      print('') 
Example #5
Source File: test_operator_util.py    From cccatalog with MIT License
def test_get_dated_main_runner_handles_zero_shift():
    dag = DAG(
        dag_id='test_dag',
        start_date=datetime.strptime('2019-01-01', '%Y-%m-%d')
    )
    execution_date = datetime.strptime(
        '2019-01-01',
        '%Y-%m-%d'
    ).replace(tzinfo=timezone.utc)
    main_func = PickleMock()
    runner = op_util.get_dated_main_runner_operator(
        dag,
        main_func,
        timedelta(minutes=1)
    )
    ti = TaskInstance(runner, execution_date)
    ti.run(ignore_task_deps=True, ignore_ti_state=True, test_mode=True)
    main_func.assert_called_with('2019-01-01') 
Example #6
Source File: test_serialized_dag.py    From airflow with Apache License 2.0
def test_remove_stale_dags(self):
        example_dags_list = list(self._write_example_dags().values())
        # Remove SubDags from the list, as they are not stored in the DB in a
        # separate row but are embedded directly in the JSON blob of the main DAG
        filtered_example_dags_list = [dag for dag in example_dags_list if not dag.is_subdag]
        # Tests removing a stale DAG
        stale_dag = SDM(filtered_example_dags_list[0])
        fresh_dag = SDM(filtered_example_dags_list[1])
        # Overwrite stale_dag's last_updated to be 10 minutes ago
        stale_dag.last_updated = timezone.utcnow() - timezone.dt.timedelta(seconds=600)
        with create_session() as session:
            session.merge(stale_dag)
            session.commit()
        # Remove any stale DAGs older than 5 minutes
        SDM.remove_stale_dags(timezone.utcnow() - timezone.dt.timedelta(seconds=300))
        self.assertFalse(SDM.has_dag(stale_dag.dag_id))
        self.assertTrue(SDM.has_dag(fresh_dag.dag_id)) 
Example #7
Source File: test_datafusion.py    From airflow with Apache License 2.0
def test_execute(self, mock_hook):
        mock_hook.return_value.get_instance.return_value = {"apiEndpoint": INSTANCE_URL}

        op = CloudDataFusionStartPipelineOperator(
            task_id="test_task",
            pipeline_name=PIPELINE_NAME,
            instance_name=INSTANCE_NAME,
            namespace=NAMESPACE,
            location=LOCATION,
            project_id=PROJECT_ID,
            runtime_args=RUNTIME_ARGS
        )
        op.dag = mock.MagicMock(spec=DAG, task_dict={}, dag_id="test")

        op.execute({})
        mock_hook.return_value.get_instance.assert_called_once_with(
            instance_name=INSTANCE_NAME, location=LOCATION, project_id=PROJECT_ID
        )

        mock_hook.return_value.start_pipeline.assert_called_once_with(
            instance_url=INSTANCE_URL,
            pipeline_name=PIPELINE_NAME,
            namespace=NAMESPACE,
            runtime_args=RUNTIME_ARGS,
        ) 
Example #8
Source File: test_fixtures.py    From dagster with Apache License 2.0
def execute_tasks_in_dag(dag, tasks, run_id, execution_date):
    assert isinstance(dag, DAG)

    handler = logging.StreamHandler(sys.stdout)
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(logging.Formatter(LOG_FORMAT))
    root = logging.getLogger('airflow.task.operators')
    root.setLevel(logging.DEBUG)
    root.addHandler(handler)

    dag_run = dag.create_dagrun(run_id=run_id, state='success', execution_date=execution_date)

    results = {}
    for task in tasks:
        ti = TaskInstance(task=task, execution_date=execution_date)
        context = ti.get_template_context()
        context['dag_run'] = dag_run

        try:
            results[ti] = task.execute(context)
        except AirflowSkipException as exc:
            results[ti] = exc

    return results 
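
A hypothetical invocation of this helper, assuming an already-built dag and using Airflow's timezone utilities for the execution date:

from airflow.utils import timezone

# Execute every task in the DAG once, outside the scheduler, and collect
# the per-TaskInstance results (or AirflowSkipException instances).
results = execute_tasks_in_dag(
    dag,
    tasks=dag.tasks,
    run_id='manual_test_run',
    execution_date=timezone.utcnow(),
)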
Example #9
Source File: dag_cycle_tester.py    From airflow with Apache License 2.0
def test_cycle_arbitrary_loop(self):
        # test arbitrary loop
        dag = DAG(
            'dag',
            start_date=DEFAULT_DATE,
            default_args={'owner': 'owner1'})

        # E -> A -> B -> F -> A
        #        -> C -> F
        with dag:
            op1 = DummyOperator(task_id='A')
            op2 = DummyOperator(task_id='B')
            op3 = DummyOperator(task_id='C')
            op4 = DummyOperator(task_id='E')
            op5 = DummyOperator(task_id='F')
            op1.set_downstream(op2)
            op1.set_downstream(op3)
            op4.set_downstream(op1)
            op3.set_downstream(op5)
            op2.set_downstream(op5)
            op5.set_downstream(op1)

        with self.assertRaises(AirflowDagCycleException):
            self.assertFalse(test_cycle(dag)) 
Example #10
Source File: dag_cycle_tester.py    From airflow with Apache License 2.0
def test_cycle_large_loop(self):
        # large loop
        dag = DAG(
            'dag',
            start_date=DEFAULT_DATE,
            default_args={'owner': 'owner1'})

        # A -> B -> C -> D -> E -> A
        with dag:
            op1 = DummyOperator(task_id='A')
            op2 = DummyOperator(task_id='B')
            op3 = DummyOperator(task_id='C')
            op4 = DummyOperator(task_id='D')
            op5 = DummyOperator(task_id='E')
            op1.set_downstream(op2)
            op2.set_downstream(op3)
            op3.set_downstream(op4)
            op4.set_downstream(op5)
            op5.set_downstream(op1)

        with self.assertRaises(AirflowDagCycleException):
            self.assertFalse(test_cycle(dag)) 
Example #11
Source File: dag_cycle_tester.py    From airflow with Apache License 2.0
def test_cycle_downstream_loop(self):
        # test downstream self loop
        dag = DAG(
            'dag',
            start_date=DEFAULT_DATE,
            default_args={'owner': 'owner1'})

        # A -> B -> C -> D -> E -> E
        with dag:
            op1 = DummyOperator(task_id='A')
            op2 = DummyOperator(task_id='B')
            op3 = DummyOperator(task_id='C')
            op4 = DummyOperator(task_id='D')
            op5 = DummyOperator(task_id='E')
            op1.set_downstream(op2)
            op2.set_downstream(op3)
            op3.set_downstream(op4)
            op4.set_downstream(op5)
            op5.set_downstream(op5)

        with self.assertRaises(AirflowDagCycleException):
            self.assertFalse(test_cycle(dag)) 
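
For contrast with the three failing cases above, a minimal sketch of the acyclic case, reusing the same fixtures: test_cycle simply returns without raising when the DAG contains no cycle.

# A -> B: no cycle, so this does not raise AirflowDagCycleException.
dag = DAG(
    'dag',
    start_date=DEFAULT_DATE,
    default_args={'owner': 'owner1'})

with dag:
    op1 = DummyOperator(task_id='A')
    op2 = DummyOperator(task_id='B')
    op1.set_downstream(op2)

test_cycle(dag)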
Example #12
Source File: _airflow_op.py    From pipelines with Apache License 2.0
def _run_airflow_op(Op, *op_args, **op_kwargs):
    from airflow.utils import db
    db.initdb()

    from datetime import datetime
    from airflow import DAG, settings
    from airflow.models import TaskInstance, Variable, XCom

    dag = DAG(dag_id='anydag', start_date=datetime.now())
    task = Op(*op_args, **op_kwargs, dag=dag, task_id='anytask')
    ti = TaskInstance(task=task, execution_date=datetime.now())
    result = task.execute(ti.get_template_context())

    variables = {var.id: var.val for var in settings.Session().query(Variable).all()}
    xcoms = {msg.key: msg.value for msg in settings.Session().query(XCom).all()}
    return (result, variables, xcoms) 
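
A hypothetical call to the helper above, wrapping a stock BashOperator (1.10-era import path, matching the rest of this example):

from airflow.operators.bash_operator import BashOperator

# Run a single operator in isolation and inspect its return value plus any
# Variables and XComs it produced.
result, variables, xcoms = _run_airflow_op(BashOperator, bash_command='echo hello')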
Example #13
Source File: cleveland_museum_workflow.py    From cccatalog with MIT License
def create_dag():
    dag = DAG(
        dag_id=DAG_ID,
        default_args=DAG_DEFAULT_ARGS,
        start_date=datetime(2020, 1, 15),
        schedule_interval="@monthly",
        catchup=False
    )

    with dag:
        start_task = get_log_operator(dag, DAG_ID, "Starting")
        run_task = get_runner_operator(dag)
        end_task = get_log_operator(dag, DAG_ID, "Finished")

        start_task >> run_task >> end_task

    return dag 
Example #14
Source File: wikimedia_workflow.py    From cccatalog with MIT License
def create_dag():
    dag = DAG(
        dag_id=DAG_ID,
        default_args=DAG_DEFAULT_ARGS,
        concurrency=3,
        max_active_runs=3,
        start_date=datetime(2003, 7, 1),
        schedule_interval='@daily',
        catchup=False,
    )

    with dag:
        start_task = get_log_operator(dag, DAG_ID, 'Starting')
        run_task = get_runner_operator(dag)
        end_task = get_log_operator(dag, DAG_ID, 'Finished')

        start_task >> run_task >> end_task

    return dag 
Example #15
Source File: test_qa_check_discovery.py    From FlowKit with Mozilla Public License 2.0
def test_additional_checks_collected_in_subdirs(tmpdir):
    from airflow import DAG
    from flowetl.util import get_qa_checks

    Path(tmpdir / "qa_checks" / "calls").mkdir(parents=True)
    Path(tmpdir / "qa_checks" / "calls" / "DUMMY_CHECK.sql").touch()
    check_operators = get_qa_checks(
        dag=DAG("DUMMY_DAG", start_date=datetime.now(), template_searchpath=str(tmpdir))
    )

    assert len(check_operators) == len(qa_checks)

    check_operators = get_qa_checks(
        dag=DAG(
            "DUMMY_DAG",
            start_date=datetime.now(),
            template_searchpath=str(tmpdir),
            params=dict(cdr_type="calls"),
        ),
    )

    assert len(check_operators) > len(qa_checks) 
Example #16
Source File: metropolitan_museum_workflow.py    From cccatalog with MIT License
def create_dag():
    dag = DAG(
        dag_id=DAG_ID,
        default_args=DAG_DEFAULT_ARGS,
        concurrency=1,
        max_active_runs=1,
        start_date=datetime(2020, 1, 1),
        schedule_interval='@daily',
        catchup=False,
    )

    with dag:
        start_task = get_log_operator(dag, DAG_ID, 'Starting')
        run_task = get_runner_operator(dag)
        end_task = get_log_operator(dag, DAG_ID, 'Finished')

        start_task >> run_task >> end_task

    return dag 
Example #17
Source File: sync_commoncrawl_workflow.py    From cccatalog with MIT License
def create_dag():
    dag = DAG(
        dag_id=DAG_ID,
        default_args=DAG_DEFAULT_ARGS,
        start_date=datetime(2020, 1, 15),
        schedule_interval="0 16 15 * *",
        catchup=False
    )

    with dag:
        start_task = get_log_operator(dag, DAG_ID, "Starting")
        run_task = get_runner_operator(dag)
        end_task = get_log_operator(dag, DAG_ID, "Finished")

        start_task >> run_task >> end_task

    return dag 
Example #18
Source File: subdag.py    From airflow with Apache License 2.0
def subdag(parent_dag_name, child_dag_name, args):
    """
    Generate a DAG to be used as a subdag.

    :param str parent_dag_name: Id of the parent DAG
    :param str child_dag_name: Id of the child DAG
    :param dict args: Default arguments to provide to the subdag
    :return: DAG to use as a subdag
    :rtype: airflow.models.DAG
    """
    dag_subdag = DAG(
        dag_id='%s.%s' % (parent_dag_name, child_dag_name),
        default_args=args,
        schedule_interval="@daily",
    )

    for i in range(5):
        DummyOperator(
            task_id='%s-task-%s' % (child_dag_name, i + 1),
            default_args=args,
            dag=dag_subdag,
        )

    return dag_subdag
# [END subdag] 
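
A minimal sketch of plugging this factory into a parent DAG via SubDagOperator; the parent name and task id here are illustrative, but note that the child's dag_id must be '<parent_dag_id>.<task_id>' for Airflow to associate the two, which is exactly what the '%s.%s' format above produces.

from airflow.operators.subdag_operator import SubDagOperator

section_1 = SubDagOperator(
    task_id='section-1',
    subdag=subdag('example_subdag_operator', 'section-1', args),
    dag=parent_dag,  # hypothetical parent DAG with dag_id 'example_subdag_operator'
)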
Example #19
Source File: fileflow_example.py    From fileflow with Apache License 2.0
def run(self, *args, **kwargs):
        # This is how you read the output of a previous task
        # The argument to read_upstream_file is based on the DAG configuration
        input_string = self.read_upstream_file("something")

        # An example bit of 'logic'
        output_string = self.output_template.format(
            input_string,
            self.get_input_filename("something"),
            self.get_output_filename()
        )

        # And write out the results of the logic to the correct file
        self.write_file(output_string)

        logging.info(output_string)


# Now let's define a DAG 
Example #20
Source File: example_skip_dag.py    From airflow with Apache License 2.0
def create_test_pipeline(suffix, trigger_rule, dag_):
    """
    Instantiate a number of operators for the given DAG.

    :param str suffix: Suffix to append to the operator task_ids
    :param str trigger_rule: TriggerRule for the join task
    :param DAG dag_: The DAG to run the operators on
    """
    skip_operator = DummySkipOperator(task_id='skip_operator_{}'.format(suffix), dag=dag_)
    always_true = DummyOperator(task_id='always_true_{}'.format(suffix), dag=dag_)
    join = DummyOperator(task_id=trigger_rule, dag=dag_, trigger_rule=trigger_rule)
    final = DummyOperator(task_id='final_{}'.format(suffix), dag=dag_)

    skip_operator >> join
    always_true >> join
    join >> final 
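
A hypothetical wiring, assuming a DAG named dag built elsewhere with suitable default_args: two pipelines on the same DAG, differing only in the join task's trigger rule.

# Build one pipeline that joins on all_success and one that joins on one_success.
create_test_pipeline('1', 'all_success', dag)
create_test_pipeline('2', 'one_success', dag)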
Example #21
Source File: example_dingding.py    From airflow with Apache License 2.0
def failure_callback(context):
    """
    The function that will be executed on failure.

    :param context: The context of the executed task.
    :type context: dict
    """
    message = 'AIRFLOW TASK FAILURE TIPS:\n' \
              'DAG:    {}\n' \
              'TASKS:  {}\n' \
              'Reason: {}\n' \
        .format(context['task_instance'].dag_id,
                context['task_instance'].task_id,
                context['exception'])
    return DingdingOperator(
        task_id='dingding_success_callback',
        dingding_conn_id='dingding_default',
        message_type='text',
        message=message,
        at_all=True,
    ).execute(context) 
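
A callback like this is typically attached through default_args so that every task failure in the DAG triggers the Dingding message; a minimal sketch, with owner and start date illustrative:

from airflow.utils.dates import days_ago

args = {
    'owner': 'airflow',
    'start_date': days_ago(2),
    # Hypothetical hookup: invoke the Dingding notifier on any task failure.
    'on_failure_callback': failure_callback,
}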
Example #22
Source File: common_api_workflows.py    From cccatalog with MIT License
def create_dag(
        source,
        script_location,
        dag_id,
        crontab_str=None,
        default_args=DAG_DEFAULT_ARGS):

    dag = DAG(
        dag_id=dag_id,
        default_args=default_args,
        schedule_interval=crontab_str,
        catchup=False
    )

    with dag:
        start_task = get_log_operator(dag, source, 'starting')
        run_task = get_runner_operator(dag, source, script_location)
        end_task = get_log_operator(dag, source, 'finished')

        start_task >> run_task >> end_task

    return dag 
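
Because the factory is fully parametrized, it lends itself to generating one DAG per provider at import time. A hypothetical loop, assuming a SOURCES mapping (names, paths, and schedule are illustrative):

# Hypothetical provider registry: source name -> (script path, dag id, crontab).
SOURCES = {
    'example_source': ('/path/to/example_source.py', 'example_source_workflow', '0 16 * * *'),
}

for source, (script_location, dag_id, crontab_str) in SOURCES.items():
    globals()[dag_id] = create_dag(source, script_location, dag_id, crontab_str)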
Example #23
Source File: test_operator_util.py    From cccatalog with MIT License
def test_get_runner_operator_creates_valid_string():
    dag = DAG(
        dag_id='test_dag',
        start_date=datetime.strptime('2019-01-01', '%Y-%m-%d')
    )
    runner = op_util.get_runner_operator(
        dag, 'test_source', '/test/script/location.py'
    )
    expected_command = 'python /test/script/location.py --mode default'
    assert runner.bash_command == expected_command 
Example #24
Source File: popularity_workflow.py    From cccatalog with MIT License
def create_dag():
    dag = DAG(
        dag_id=DAG_ID,
        default_args=DAG_DEFAULT_ARGS,
        start_date=datetime(2020, 1, 1),
        schedule_interval='@monthly',
        catchup=False
    )
    with dag:
        start_task = get_log_operator(dag, DAG_ID, 'Starting')
        run_task = get_runner_operator(dag)
        end_task = get_log_operator(dag, DAG_ID, 'Finished')
        start_task >> run_task >> end_task
    return dag 
Example #25
Source File: S1_GRD_1SDV.py    From evo-odas with MIT License
def prepare_band_paths(get_inputs_from, *args, **kwargs):
    """Get Product / Band files path Dictionary from ZipInspector and extract the list of band files """

    task_instance = kwargs['ti']

    # band number from the task name
    task_id = task_instance.task_id
    band_number = int(task_id.split('_')[-1])

    log.info("Getting inputs from: " + get_inputs_from)
    product_bands_dict = task_instance.xcom_pull(task_ids=get_inputs_from, key=XCOM_RETURN_KEY)
    if product_bands_dict is None:
        log.info("No input from ZipInspector. Nothing to do")
        return None

    log.info("Product Band Dictionary: {}".format(pprint.pformat(product_bands_dict)))

    files_path = []
    for k in product_bands_dict:
        files_path += product_bands_dict[k]

    # Push one of the band paths to XCom
    file_path = files_path[band_number - 1]
    return [file_path]

# DAG definition 
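
A hypothetical sketch of wiring this callable into the DAG with a PythonOperator (1.10-era import path), assuming an upstream zip-inspector task; the trailing _1 in the task id is what the function parses to select band 1:

from airflow.operators.python_operator import PythonOperator

prepare_band_1 = PythonOperator(
    task_id='prepare_band_paths_1',  # suffix selects the band number
    python_callable=prepare_band_paths,
    op_args=['zip_inspector'],       # hypothetical upstream task id
    provide_context=True,            # makes 'ti' available in kwargs
    dag=dag,
)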
Example #26
Source File: test_qa_check_discovery.py    From FlowKit with Mozilla Public License 2.0
def test_additional_checks_collected(tmpdir):
    from airflow import DAG
    from flowetl.util import get_qa_checks

    Path(tmpdir / "qa_checks").mkdir()
    Path(tmpdir / "qa_checks" / "DUMMY_CHECK.sql").touch()
    check_operators = get_qa_checks(
        dag=DAG("DUMMY_DAG", start_date=datetime.now(), template_searchpath=str(tmpdir))
    )

    assert len(check_operators) > len(qa_checks) 
Example #27
Source File: test_good_dags.py    From airflow-declarative with Apache License 2.0
def test_good_dags(path):
    dags = airflow_declarative.from_path(path)
    assert isinstance(dags, list)
    assert all(isinstance(dag, airflow.DAG) for dag in dags) 
Example #28
Source File: test_qa_check_discovery.py    From FlowKit with Mozilla Public License 2.0
def test_default_qa_checks_found():
    from airflow import DAG
    from flowetl.util import get_qa_checks

    dag = DAG("DUMMY_DAG", start_date=datetime.now())
    check_operators = get_qa_checks(dag=dag)
    assert {op.task_id: op.sql for op in check_operators} == qa_checks 
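
get_qa_checks only returns the operators; in a real DAG they still need ordering relative to the tasks whose output they verify. A hypothetical sketch, assuming an etl_task defined elsewhere on the same DAG:

# Run every QA check after the main ETL task has completed.
for check in check_operators:
    etl_task >> check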
Example #29
Source File: test_qa_check_discovery.py    From FlowKit with Mozilla Public License 2.0
def test_name_suffix_added(tmpdir):
    from airflow import DAG
    from flowetl.util import get_qa_checks

    Path(tmpdir / "qa_checks" / "calls").mkdir(parents=True)
    Path(tmpdir / "qa_checks" / "calls" / "DUMMY_CHECK.sql").touch()
    check_operators = get_qa_checks(
        dag=DAG(
            "DUMMY_DAG",
            start_date=datetime.now(),
            template_searchpath=str(tmpdir),
            params=dict(cdr_type="calls"),
        )
    )
    assert any(op for op in check_operators if op.task_id == "DUMMY_CHECK.calls") 
Example #30
Source File: test_integer_callback_arg.py    From airflow-declarative with Apache License 2.0
def test_integer_callback_arg(good_dag_path):
    path = good_dag_path("integer_callback_arg")
    dags = airflow_declarative.from_path(path)

    assert len(dags) == 1

    yml_dag = dags[0]

    assert isinstance(yml_dag, DAG)

    myoperator = yml_dag.task_dict["myoperator"]
    param = myoperator._callback_args["param"]
    assert isinstance(param, int)